From c1068500bdca6bdd93c6cd24c3de34b2c0b6548f Mon Sep 17 00:00:00 2001 From: Alfred Date: Mon, 16 Mar 2026 07:30:31 +0100 Subject: [PATCH] feat: follow embedded players --- cmd/radiostreamscan/main.go | 58 +++++++++++++++++++- internal/extractor/extractor.go | 82 ++++++++++++++++++++-------- internal/extractor/extractor_test.go | 29 +++++++++- 3 files changed, 141 insertions(+), 28 deletions(-) diff --git a/cmd/radiostreamscan/main.go b/cmd/radiostreamscan/main.go index dd87f7a..348f95c 100644 --- a/cmd/radiostreamscan/main.go +++ b/cmd/radiostreamscan/main.go @@ -319,7 +319,6 @@ func scanOneURL(client *http.Client, cfg *config, raw string) scanResult { streams := extractor.ExtractStreams(html) playlists := extractor.ExtractPlaylistLinks(html) - res.Playlists = playlists for _, pl := range playlists { plContent, plType, err := fetchContent(client, cfg, pl) @@ -333,6 +332,41 @@ func scanOneURL(client *http.Client, cfg *config, raw string) scanResult { } } + embedURLs := extractor.ExtractEmbedURLs(html) + seenEmbeds := make(map[string]struct{}) + for _, embed := range embedURLs { + embedURL := resolveURL(raw, embed) + if embedURL == "" || embedURL == raw { + continue + } + if _, ok := seenEmbeds[embedURL]; ok { + continue + } + seenEmbeds[embedURL] = struct{}{} + + embedHTML, _, err := fetchContent(client, cfg, embedURL) + if err != nil { + continue + } + + streams = append(streams, extractor.ExtractStreams(embedHTML)...) + embedPlaylists := extractor.ExtractPlaylistLinks(embedHTML) + playlists = append(playlists, embedPlaylists...) + + for _, pl := range embedPlaylists { + plContent, plType, err := fetchContent(client, cfg, pl) + if err != nil { + continue + } + parsed := extractor.ParsePlaylist(plContent, plType) + if len(parsed) > 0 { + streams = append(streams, parsed...) + res.FromPlaylist = true + } + } + } + + res.Playlists = uniqueStrings(playlists) res.Streams = uniqueStrings(streams) if cfg.Probe { @@ -469,6 +503,28 @@ func uniqueStrings(values []string) []string { return out } +func resolveURL(base, href string) string { + href = strings.TrimSpace(href) + if href == "" { + return "" + } + if strings.HasPrefix(href, "//") { + return "https:" + href + } + parsed, err := url.Parse(href) + if err != nil { + return "" + } + if parsed.IsAbs() { + return parsed.String() + } + baseURL, err := url.Parse(base) + if err != nil { + return parsed.String() + } + return baseURL.ResolveReference(parsed).String() +} + type historyWriter struct { path string mu sync.Mutex diff --git a/internal/extractor/extractor.go b/internal/extractor/extractor.go index 94023eb..e57d9e7 100644 --- a/internal/extractor/extractor.go +++ b/internal/extractor/extractor.go @@ -9,25 +9,25 @@ import ( var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`) var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`) var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`) +var iframePattern = regexp.MustCompile(`(?i)]+src\s*=\s*['"]([^'"]+)['"]`) +var audioPattern = regexp.MustCompile(`(?i)]+src\s*=\s*['"]([^'"]+)['"]`) +var sourcePattern = regexp.MustCompile(`(?i)]+src\s*=\s*['"]([^'"]+)['"]`) var xspfPattern = regexp.MustCompile(`(?i)([^<]+)`) // ExtractStreams returns the unique streaming URLs found in the provided HTML/text. func ExtractStreams(data string) []string { candidates := make(map[string]struct{}) + special := make(map[string]struct{}) add := func(raw string) { - raw = strings.TrimSpace(raw) - if raw == "" { - return - } - if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) { - return + if normalized, ok := normalizeCandidate(raw); ok { + candidates[normalized] = struct{}{} } - if strings.HasPrefix(raw, "//") { - raw = "https:" + raw + } + addSpecial := func(raw string) { + if normalized, ok := normalizeCandidate(raw); ok { + candidates[normalized] = struct{}{} + special[normalized] = struct{}{} } - normalized := strings.TrimRight(raw, "+") - normalized = strings.ReplaceAll(normalized, `\\`, "") - candidates[normalized] = struct{}{} } for _, match := range urlPattern.FindAllStringSubmatch(data, -1) { @@ -39,11 +39,21 @@ func ExtractStreams(data string) []string { for _, match := range srcPattern.FindAllStringSubmatch(data, -1) { add(match[1]) } + for _, match := range audioPattern.FindAllStringSubmatch(data, -1) { + addSpecial(match[1]) + } + for _, match := range sourcePattern.FindAllStringSubmatch(data, -1) { + addSpecial(match[1]) + } streams := make([]string, 0, len(candidates)) for u := range candidates { if isStreamURL(u) { streams = append(streams, u) + continue + } + if _, ok := special[u]; ok { + streams = append(streams, u) } } sort.Strings(streams) @@ -54,19 +64,7 @@ func ExtractStreams(data string) []string { func ExtractPlaylistLinks(data string) []string { candidates := make(map[string]struct{}) add := func(raw string) { - raw = strings.TrimSpace(raw) - if raw == "" { - return - } - if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) { - return - } - if strings.HasPrefix(raw, "//") { - raw = "https:" + raw - } - normalized := strings.TrimRight(raw, "+") - normalized = strings.ReplaceAll(normalized, `\\`, "") - if isPlaylistURL(normalized) { + if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) { candidates[normalized] = struct{}{} } } @@ -89,6 +87,23 @@ func ExtractPlaylistLinks(data string) []string { return links } +// ExtractEmbedURLs returns URLs found in iframe embeds. +func ExtractEmbedURLs(data string) []string { + candidates := make(map[string]struct{}) + for _, match := range iframePattern.FindAllStringSubmatch(data, -1) { + if normalized, ok := normalizeCandidate(match[1]); ok { + candidates[normalized] = struct{}{} + } + } + + urls := make([]string, 0, len(candidates)) + for u := range candidates { + urls = append(urls, u) + } + sort.Strings(urls) + return urls +} + // ParsePlaylist extracts stream URLs from playlist content. func ParsePlaylist(content string, contentType string) []string { candidates := make(map[string]struct{}) @@ -149,6 +164,25 @@ func ParsePlaylist(content string, contentType string) []string { return streams } +func normalizeCandidate(raw string) (string, bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", false + } + if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) { + return "", false + } + if strings.HasPrefix(raw, "//") { + raw = "https:" + raw + } + normalized := strings.TrimRight(raw, "+") + normalized = strings.ReplaceAll(normalized, `\\`, "") + if normalized == "" { + return "", false + } + return normalized, true +} + func isStreamURL(u string) bool { lower := strings.ToLower(u) return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") || diff --git a/internal/extractor/extractor_test.go b/internal/extractor/extractor_test.go index c1747fd..4b25715 100644 --- a/internal/extractor/extractor_test.go +++ b/internal/extractor/extractor_test.go @@ -1,6 +1,9 @@ package extractor -import "testing" +import ( + "reflect" + "testing" +) func TestExtractStreams(t *testing.T) { html := ` @@ -11,13 +14,24 @@ func TestExtractStreams(t *testing.T) { listen +
` streams := ExtractStreams(html) - if len(streams) != 6 { - t.Fatalf("wanted 6 streams, got %d: %v", len(streams), streams) + if len(streams) != 7 { + t.Fatalf("wanted 7 streams, got %d: %v", len(streams), streams) + } + found := false + for _, s := range streams { + if s == "https://stream.example.com/live" { + found = true + break + } + } + if !found { + t.Fatalf("expected audio tag stream to be present: %v", streams) } } @@ -34,6 +48,15 @@ func TestExtractPlaylistLinks(t *testing.T) { } } +func TestExtractEmbedURLs(t *testing.T) { + html := `` + urls := ExtractEmbedURLs(html) + want := []string{"https://example.com/embed", "https://example.org/player"} + if !reflect.DeepEqual(urls, want) { + t.Fatalf("wanted iframe URLs %v, got %v", want, urls) + } +} + func TestParsePlaylist(t *testing.T) { m3u := "#EXTM3U\nhttps://example.com/live.mp3\n" pls := "[playlist]\nFile1=https://example.com/stream.aac\n"