Browse Source

feat: follow embedded players

master
Alfred 5 days ago
parent
commit
c1068500bd
3 changed files with 141 additions and 28 deletions
  1. +57
    -1
      cmd/radiostreamscan/main.go
  2. +58
    -24
      internal/extractor/extractor.go
  3. +26
    -3
      internal/extractor/extractor_test.go

+ 57
- 1
cmd/radiostreamscan/main.go View File

@@ -319,7 +319,6 @@ func scanOneURL(client *http.Client, cfg *config, raw string) scanResult {


streams := extractor.ExtractStreams(html) streams := extractor.ExtractStreams(html)
playlists := extractor.ExtractPlaylistLinks(html) playlists := extractor.ExtractPlaylistLinks(html)
res.Playlists = playlists


for _, pl := range playlists { for _, pl := range playlists {
plContent, plType, err := fetchContent(client, cfg, pl) plContent, plType, err := fetchContent(client, cfg, pl)
@@ -333,6 +332,41 @@ func scanOneURL(client *http.Client, cfg *config, raw string) scanResult {
} }
} }


embedURLs := extractor.ExtractEmbedURLs(html)
seenEmbeds := make(map[string]struct{})
for _, embed := range embedURLs {
embedURL := resolveURL(raw, embed)
if embedURL == "" || embedURL == raw {
continue
}
if _, ok := seenEmbeds[embedURL]; ok {
continue
}
seenEmbeds[embedURL] = struct{}{}

embedHTML, _, err := fetchContent(client, cfg, embedURL)
if err != nil {
continue
}

streams = append(streams, extractor.ExtractStreams(embedHTML)...)
embedPlaylists := extractor.ExtractPlaylistLinks(embedHTML)
playlists = append(playlists, embedPlaylists...)

for _, pl := range embedPlaylists {
plContent, plType, err := fetchContent(client, cfg, pl)
if err != nil {
continue
}
parsed := extractor.ParsePlaylist(plContent, plType)
if len(parsed) > 0 {
streams = append(streams, parsed...)
res.FromPlaylist = true
}
}
}

res.Playlists = uniqueStrings(playlists)
res.Streams = uniqueStrings(streams) res.Streams = uniqueStrings(streams)


if cfg.Probe { if cfg.Probe {
@@ -469,6 +503,28 @@ func uniqueStrings(values []string) []string {
return out return out
} }


// resolveURL turns href, as found on the page located at base, into an
// absolute URL string.
//
// Surrounding whitespace in href is ignored. A scheme-relative href
// ("//host/path") is promoted to https. An href that already carries a scheme
// is returned in its parsed-and-reserialized form. Any other href is resolved
// against base.
//
// It returns "" when href is blank, when href cannot be parsed, or when no
// absolute URL can be produced because base is unparsable or has no scheme.
// Callers can therefore treat "" as "nothing to fetch".
func resolveURL(base, href string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}
	if strings.HasPrefix(href, "//") {
		return "https:" + href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	if ref.IsAbs() {
		return ref.String()
	}
	baseURL, err := url.Parse(base)
	if err != nil {
		// Without a usable base a relative href cannot be made fetchable.
		return ""
	}
	resolved := baseURL.ResolveReference(ref)
	if !resolved.IsAbs() {
		// base itself had no scheme, so the result is still relative.
		return ""
	}
	return resolved.String()
}

type historyWriter struct { type historyWriter struct {
path string path string
mu sync.Mutex mu sync.Mutex


+ 58
- 24
internal/extractor/extractor.go View File

@@ -9,25 +9,25 @@ import (
var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`) var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`)
var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`) var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)
var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`) var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`)
// iframePattern captures the quoted src value of an <iframe> tag
// (case-insensitive). Because the tag name is followed by `[^>]+src`, any
// attribute whose name ends in "src" (e.g. data-src) is matched as well.
var iframePattern = regexp.MustCompile(`(?i)<iframe[^>]+src\s*=\s*['"]([^'"]+)['"]`)
// audioPattern captures the quoted src value of an <audio> tag, with the same
// matching rules as iframePattern.
var audioPattern = regexp.MustCompile(`(?i)<audio[^>]+src\s*=\s*['"]([^'"]+)['"]`)
// sourcePattern captures the quoted src value of a <source> tag, with the same
// matching rules as iframePattern.
var sourcePattern = regexp.MustCompile(`(?i)<source[^>]+src\s*=\s*['"]([^'"]+)['"]`)
var xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`) var xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)


// ExtractStreams returns the unique streaming URLs found in the provided HTML/text. // ExtractStreams returns the unique streaming URLs found in the provided HTML/text.
func ExtractStreams(data string) []string { func ExtractStreams(data string) []string {
candidates := make(map[string]struct{}) candidates := make(map[string]struct{})
special := make(map[string]struct{})
add := func(raw string) { add := func(raw string) {
raw = strings.TrimSpace(raw)
if raw == "" {
return
}
if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) {
return
if normalized, ok := normalizeCandidate(raw); ok {
candidates[normalized] = struct{}{}
} }
if strings.HasPrefix(raw, "//") {
raw = "https:" + raw
}
addSpecial := func(raw string) {
if normalized, ok := normalizeCandidate(raw); ok {
candidates[normalized] = struct{}{}
special[normalized] = struct{}{}
} }
normalized := strings.TrimRight(raw, "+")
normalized = strings.ReplaceAll(normalized, `\\`, "")
candidates[normalized] = struct{}{}
} }


for _, match := range urlPattern.FindAllStringSubmatch(data, -1) { for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
@@ -39,11 +39,21 @@ func ExtractStreams(data string) []string {
for _, match := range srcPattern.FindAllStringSubmatch(data, -1) { for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
add(match[1]) add(match[1])
} }
for _, match := range audioPattern.FindAllStringSubmatch(data, -1) {
addSpecial(match[1])
}
for _, match := range sourcePattern.FindAllStringSubmatch(data, -1) {
addSpecial(match[1])
}


streams := make([]string, 0, len(candidates)) streams := make([]string, 0, len(candidates))
for u := range candidates { for u := range candidates {
if isStreamURL(u) { if isStreamURL(u) {
streams = append(streams, u) streams = append(streams, u)
continue
}
if _, ok := special[u]; ok {
streams = append(streams, u)
} }
} }
sort.Strings(streams) sort.Strings(streams)
@@ -54,19 +64,7 @@ func ExtractStreams(data string) []string {
func ExtractPlaylistLinks(data string) []string { func ExtractPlaylistLinks(data string) []string {
candidates := make(map[string]struct{}) candidates := make(map[string]struct{})
add := func(raw string) { add := func(raw string) {
raw = strings.TrimSpace(raw)
if raw == "" {
return
}
if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) {
return
}
if strings.HasPrefix(raw, "//") {
raw = "https:" + raw
}
normalized := strings.TrimRight(raw, "+")
normalized = strings.ReplaceAll(normalized, `\\`, "")
if isPlaylistURL(normalized) {
if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
candidates[normalized] = struct{}{} candidates[normalized] = struct{}{}
} }
} }
@@ -89,6 +87,23 @@ func ExtractPlaylistLinks(data string) []string {
return links return links
} }


// ExtractEmbedURLs collects the src targets of iframe tags found in data.
//
// Each captured src is passed through normalizeCandidate; values it rejects
// (for example a site-relative path that contains no "http") are skipped.
// The result is de-duplicated, sorted lexically, and never nil.
func ExtractEmbedURLs(data string) []string {
	seen := map[string]struct{}{}
	for _, m := range iframePattern.FindAllStringSubmatch(data, -1) {
		src, ok := normalizeCandidate(m[1])
		if !ok {
			continue
		}
		seen[src] = struct{}{}
	}

	out := make([]string, 0, len(seen))
	for src := range seen {
		out = append(out, src)
	}
	sort.Strings(out)
	return out
}

// ParsePlaylist extracts stream URLs from playlist content. // ParsePlaylist extracts stream URLs from playlist content.
func ParsePlaylist(content string, contentType string) []string { func ParsePlaylist(content string, contentType string) []string {
candidates := make(map[string]struct{}) candidates := make(map[string]struct{})
@@ -149,6 +164,25 @@ func ParsePlaylist(content string, contentType string) []string {
return streams return streams
} }


// normalizeCandidate tidies a raw URL-like string pulled out of markup.
//
// The input is whitespace-trimmed. It is rejected (second result false) when
// it is empty, or when it neither starts with "//" nor contains the substring
// "http". A leading "//" is promoted to "https://". Trailing '+' characters
// are stripped and every pair of consecutive backslashes is removed.
func normalizeCandidate(raw string) (string, bool) {
	candidate := strings.TrimSpace(raw)
	switch {
	case candidate == "":
		return "", false
	case strings.HasPrefix(candidate, "//"):
		candidate = "https:" + candidate
	case !strings.Contains(candidate, "http"):
		return "", false
	}

	candidate = strings.ReplaceAll(strings.TrimRight(candidate, "+"), `\\`, "")
	if len(candidate) == 0 {
		return "", false
	}
	return candidate, true
}

func isStreamURL(u string) bool { func isStreamURL(u string) bool {
lower := strings.ToLower(u) lower := strings.ToLower(u)
return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") || return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") ||


+ 26
- 3
internal/extractor/extractor_test.go View File

@@ -1,6 +1,9 @@
package extractor package extractor


import "testing"
import (
"reflect"
"testing"
)


func TestExtractStreams(t *testing.T) { func TestExtractStreams(t *testing.T) {
html := ` html := `
@@ -11,13 +14,24 @@ func TestExtractStreams(t *testing.T) {
</script> </script>
<a href="https://streams.example.org/radio.aac?user=test">listen</a> <a href="https://streams.example.org/radio.aac?user=test">listen</a>
<source src="//players.example.eu/ambient.ogg" type="audio/ogg" /> <source src="//players.example.eu/ambient.ogg" type="audio/ogg" />
<audio src="https://stream.example.com/live"></audio>
<audio data-src="https://pod.example.com/episode.opus"></audio> <audio data-src="https://pod.example.com/episode.opus"></audio>
<div data-value="https://example.com/secret.pls"></div> <div data-value="https://example.com/secret.pls"></div>
` `


streams := ExtractStreams(html) streams := ExtractStreams(html)
if len(streams) != 6 {
t.Fatalf("wanted 6 streams, got %d: %v", len(streams), streams)
if len(streams) != 7 {
t.Fatalf("wanted 7 streams, got %d: %v", len(streams), streams)
}
found := false
for _, s := range streams {
if s == "https://stream.example.com/live" {
found = true
break
}
}
if !found {
t.Fatalf("expected audio tag stream to be present: %v", streams)
} }
} }


@@ -34,6 +48,15 @@ func TestExtractPlaylistLinks(t *testing.T) {
} }
} }


// TestExtractEmbedURLs feeds ExtractEmbedURLs two iframes, one scheme-relative
// and one absolute, and expects both back as https URLs in sorted order.
func TestExtractEmbedURLs(t *testing.T) {
	const page = `<iframe src="//example.com/embed"></iframe><iframe src="https://example.org/player"></iframe>`
	expected := []string{
		"https://example.com/embed",
		"https://example.org/player",
	}
	if got := ExtractEmbedURLs(page); !reflect.DeepEqual(got, expected) {
		t.Fatalf("wanted iframe URLs %v, got %v", expected, got)
	}
}

func TestParsePlaylist(t *testing.T) { func TestParsePlaylist(t *testing.T) {
m3u := "#EXTM3U\nhttps://example.com/live.mp3\n" m3u := "#EXTM3U\nhttps://example.com/live.mp3\n"
pls := "[playlist]\nFile1=https://example.com/stream.aac\n" pls := "[playlist]\nFile1=https://example.com/stream.aac\n"


Loading…
Cancel
Save