Bläddra i källkod

feat: follow script assets

master
Alfred 5 dagar sedan
förälder
incheckning
3058ffc2ad
3 ändrade filer med 662 tillägg och 580 borttagningar
  1. +430
    -381
      cmd/radiostreamscan/main.go
  2. +183
    -159
      internal/extractor/extractor.go
  3. +49
    -40
      internal/extractor/extractor_test.go

+ 430
- 381
cmd/radiostreamscan/main.go Visa fil

@@ -1,114 +1,114 @@
package main

import (
"bufio"
"encoding/csv"
"encoding/json"
"flag"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"
"radio-stream-extractor/internal/extractor"
"bufio"
"encoding/csv"
"encoding/json"
"flag"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"
"radio-stream-extractor/internal/extractor"
)

// scanResult is the outcome of scanning one input URL.
type scanResult struct {
	URL          string        `json:"url"`
	Streams      []string      `json:"streams"`
	Playlists    []string      `json:"playlists,omitempty"`
	Probes       []probeResult `json:"probes,omitempty"`
	Error        string        `json:"error,omitempty"` // set only when the page itself could not be fetched
	FetchedAt    time.Time     `json:"fetchedAt"`
	FromPlaylist bool          `json:"fromPlaylist"` // true when any stream came from a parsed playlist
}

// probeResult records the outcome of a HEAD probe against one stream URL.
type probeResult struct {
	URL         string `json:"url"`
	Status      string `json:"status"` // HTTP status line, or the request error text
	ContentType string `json:"contentType,omitempty"`
}

// config holds scan options shared by CLI and web modes.
type config struct {
	Format      string        // output format: text|json|csv|pls
	Probe       bool          // HEAD-probe discovered streams
	Headers     headerList    // extra request headers ("Name: value")
	Proxy       string        // optional HTTP proxy URL
	HistoryPath string        // JSONL history file; empty disables logging
	Watch       time.Duration // CLI repeat interval; 0 scans once
	Concurrency int           // number of concurrent fetch workers
}

// headerList collects repeatable -header flag values ("Name: value" strings).
// It implements flag.Value.
type headerList []string

// String renders the collected headers for flag help output.
func (h *headerList) String() string { return strings.Join(*h, ", ") }

// Set appends one raw header string; it never fails.
func (h *headerList) Set(v string) error {
	*h = append(*h, v)
	return nil
}

// main parses flags and dispatches to web-server mode (default when no URLs
// are given, or when -web is set) or one-shot/watch CLI mode.
func main() {
	port := flag.String("port", ":8080", "listen address for the web server (default :8080)")
	web := flag.Bool("web", false, "force web-server mode even when URLs are provided")
	cfg := config{}
	flag.StringVar(&cfg.Format, "format", "text", "output format: text|json|csv|pls")
	flag.BoolVar(&cfg.Probe, "probe", true, "probe discovered stream URLs with HTTP HEAD")
	flag.Var(&cfg.Headers, "header", "custom HTTP header (repeatable), e.g. -header 'Referer: https://example.com'")
	flag.StringVar(&cfg.Proxy, "proxy", "", "HTTP proxy URL (optional)")
	flag.StringVar(&cfg.HistoryPath, "history", "history.jsonl", "path to JSONL history log (empty to disable)")
	flag.DurationVar(&cfg.Watch, "watch", 0, "repeat scan in CLI mode at interval (e.g. 30s, 2m)")
	flag.IntVar(&cfg.Concurrency, "concurrency", 4, "number of concurrent fetch workers")
	flag.Usage = func() {
		fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] <url> [url...]\n", os.Args[0])
		flag.PrintDefaults()
	}
	flag.Parse()

	urls := flag.Args()
	client := newHTTPClient(cfg.Proxy)
	history := newHistoryWriter(cfg.HistoryPath)

	// No URLs on the command line (or explicit -web) means web-server mode.
	if *web || len(urls) == 0 {
		if err := runWebMode(*port, client, &cfg, history); err != nil {
			fmt.Fprintf(os.Stderr, "web mode failed: %v\n", err)
			os.Exit(1)
		}
		return
	}
	runCLIMode(urls, client, &cfg, history)
}

// runCLIMode scans the given URLs, writes the results to stdout in the
// configured format, and appends them to the history log. With cfg.Watch > 0
// it repeats the scan at that interval forever; otherwise it returns after
// one pass.
func runCLIMode(urls []string, client *http.Client, cfg *config, history *historyWriter) {
	for {
		results := scanURLs(urls, client, cfg)
		outputResults(results, cfg.Format, os.Stdout)
		history.Write(results)
		if cfg.Watch == 0 {
			return
		}
		time.Sleep(cfg.Watch)
	}
}

// runWebMode serves the scan UI and API on addr until the server fails.
func runWebMode(addr string, client *http.Client, cfg *config, history *historyWriter) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/", indexHandler)
	mux.HandleFunc("/scan", makeScanHandler(client, cfg, history))
	mux.HandleFunc("/watch", watchHandler)

	fmt.Printf("radiostreamscan listening on %s (GET /scan?url=... or POST url=...)\n", addr)
	// ReadHeaderTimeout guards against clients that open a connection and
	// never finish sending headers (slowloris).
	srv := &http.Server{Addr: addr, Handler: mux, ReadHeaderTimeout: 10 * time.Second}
	return srv.ListenAndServe()
}

func indexHandler(w http.ResponseWriter, r *http.Request) {
fmt.Fprintf(w, `<!doctype html>
fmt.Fprintf(w, `<!doctype html>
<html>
<head><meta charset="utf-8"><title>radiostreamscan</title></head>
<body>
@@ -136,10 +136,10 @@ func indexHandler(w http.ResponseWriter, r *http.Request) {
}

func watchHandler(w http.ResponseWriter, r *http.Request) {
urls := normalizeURLInputs(r.URL.Query()["url"])
interval := r.URL.Query().Get("interval")
probe := r.URL.Query().Get("probe")
fmt.Fprintf(w, `<!doctype html>
urls := normalizeURLInputs(r.URL.Query()["url"])
interval := r.URL.Query().Get("interval")
probe := r.URL.Query().Get("probe")
fmt.Fprintf(w, `<!doctype html>
<html>
<head><meta charset="utf-8"><title>radiostreamscan results</title>
<style>
@@ -224,337 +224,386 @@ func watchHandler(w http.ResponseWriter, r *http.Request) {
}

// makeScanHandler returns an HTTP handler that scans URLs supplied via GET
// query parameters or POST form fields. The per-request query parameters
// "probe" (0/1) and "format" override the server defaults without mutating
// the shared cfg.
func makeScanHandler(client *http.Client, cfg *config, history *historyWriter) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var urls []string
		switch r.Method {
		case http.MethodGet:
			urls = r.URL.Query()["url"]
		case http.MethodPost:
			if err := r.ParseForm(); err != nil {
				http.Error(w, err.Error(), http.StatusBadRequest)
				return
			}
			urls = r.Form["url"]
		default:
			http.Error(w, "only GET and POST supported", http.StatusMethodNotAllowed)
			return
		}
		urls = normalizeURLInputs(urls)
		if len(urls) == 0 {
			http.Error(w, "provide at least one url parameter", http.StatusBadRequest)
			return
		}
		// Copy cfg so per-request overrides do not leak between requests.
		localCfg := *cfg
		switch r.URL.Query().Get("probe") {
		case "1":
			localCfg.Probe = true
		case "0":
			localCfg.Probe = false
		}
		if f := r.URL.Query().Get("format"); f != "" {
			localCfg.Format = f
		}
		results := scanURLs(urls, client, &localCfg)
		history.Write(results)
		outputResults(results, localCfg.Format, w)
	}
}

// normalizeURLInputs splits each input on newlines and trims whitespace,
// dropping empty lines. This lets a single textarea submission carry many
// URLs, one per line.
func normalizeURLInputs(inputs []string) []string {
	var urls []string
	for _, item := range inputs {
		for _, line := range strings.Split(item, "\n") {
			if line = strings.TrimSpace(line); line != "" {
				urls = append(urls, line)
			}
		}
	}
	return urls
}

// scanURLs scans each URL concurrently using cfg.Concurrency workers
// (minimum 1) and returns the results in input order.
func scanURLs(urls []string, client *http.Client, cfg *config) []scanResult {
	results := make([]scanResult, len(urls))
	type job struct {
		index int
		url   string
	}
	jobs := make(chan job)
	var wg sync.WaitGroup
	workers := cfg.Concurrency
	if workers < 1 {
		workers = 1
	}
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := range jobs {
				// Each worker writes to a distinct index, so no lock is needed.
				results[j.index] = scanOneURL(client, cfg, j.url)
			}
		}()
	}
	for i, u := range urls {
		jobs <- job{index: i, url: u}
	}
	close(jobs)
	wg.Wait()
	return results
}

// scanOneURL fetches a single page and collects candidate stream URLs from
// four sources: the page itself, playlists it links to, embedded iframe
// players, and script assets served from the same host as the page.
// Failures on sub-resources are silently skipped (best effort); only a
// failure to fetch the page itself is reported via res.Error.
func scanOneURL(client *http.Client, cfg *config, raw string) scanResult {
	res := scanResult{URL: raw, FetchedAt: time.Now()}
	html, _, err := fetchContent(client, cfg, raw)
	if err != nil {
		res.Error = err.Error()
		return res
	}

	// The base host restricts script-asset fetches to the same site, so we
	// don't download arbitrary third-party JS (analytics, CDNs, ads).
	parsedBase, _ := url.Parse(raw)
	baseHost := ""
	if parsedBase != nil {
		baseHost = parsedBase.Hostname()
	}

	streams := extractor.ExtractStreams(html)
	playlists := extractor.ExtractPlaylistLinks(html)

	// expandPlaylists fetches each playlist link and folds its parsed
	// entries into streams, marking the result as playlist-derived.
	expandPlaylists := func(links []string) {
		for _, pl := range links {
			plContent, plType, err := fetchContent(client, cfg, pl)
			if err != nil {
				continue
			}
			parsed := extractor.ParsePlaylist(plContent, plType)
			if len(parsed) > 0 {
				streams = append(streams, parsed...)
				res.FromPlaylist = true
			}
		}
	}
	expandPlaylists(playlists)

	// Follow iframe embeds (external players) once each.
	seenEmbeds := make(map[string]struct{})
	for _, embed := range extractor.ExtractEmbedURLs(html) {
		embedURL := resolveURL(raw, embed)
		if embedURL == "" || embedURL == raw {
			continue
		}
		if _, ok := seenEmbeds[embedURL]; ok {
			continue
		}
		seenEmbeds[embedURL] = struct{}{}

		embedHTML, _, err := fetchContent(client, cfg, embedURL)
		if err != nil {
			continue
		}
		streams = append(streams, extractor.ExtractStreams(embedHTML)...)
		embedPlaylists := extractor.ExtractPlaylistLinks(embedHTML)
		playlists = append(playlists, embedPlaylists...)
		expandPlaylists(embedPlaylists)
	}

	// Follow same-host script assets once each; player JS often carries the
	// stream URL that never appears in the HTML.
	seenScripts := make(map[string]struct{})
	for _, script := range extractor.ExtractScriptURLs(html) {
		scriptURL := resolveURL(raw, script)
		if scriptURL == "" || scriptURL == raw {
			continue
		}
		if baseHost != "" {
			parsedScript, err := url.Parse(scriptURL)
			if err != nil {
				continue
			}
			// Relative script URLs (empty hostname) are implicitly same-host.
			if parsedScript.Hostname() != "" && parsedScript.Hostname() != baseHost {
				continue
			}
		}
		if _, ok := seenScripts[scriptURL]; ok {
			continue
		}
		seenScripts[scriptURL] = struct{}{}

		scriptBody, _, err := fetchContent(client, cfg, scriptURL)
		if err != nil {
			continue
		}
		streams = append(streams, extractor.ExtractStreams(scriptBody)...)
		scriptPlaylists := extractor.ExtractPlaylistLinks(scriptBody)
		playlists = append(playlists, scriptPlaylists...)
		expandPlaylists(scriptPlaylists)
	}

	res.Playlists = uniqueStrings(playlists)
	res.Streams = uniqueStrings(streams)

	if cfg.Probe {
		res.Probes = probeStreams(client, cfg, res.Streams)
	}
	return res
}

// fetchContent GETs raw with the configured custom headers and returns up to
// 2 MiB of the response body plus the response Content-Type. Any non-200
// status is reported as an error.
func fetchContent(client *http.Client, cfg *config, raw string) (string, string, error) {
	req, err := http.NewRequest(http.MethodGet, raw, nil)
	if err != nil {
		return "", "", err
	}
	req.Header.Set("User-Agent", "radiostreamscan/0.2")
	for _, h := range cfg.Headers {
		// Each flag value is "Name: value"; malformed entries are ignored.
		parts := strings.SplitN(h, ":", 2)
		if len(parts) == 2 {
			req.Header.Set(strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]))
		}
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", "", fmt.Errorf("unexpected status %s", resp.Status)
	}
	// Cap the read at 2 MiB so a huge page or an endless audio stream
	// cannot exhaust memory.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
	if err != nil {
		return "", "", err
	}
	return string(body), resp.Header.Get("Content-Type"), nil
}

// probeStreams issues a HEAD request to each stream URL, applying the
// configured custom headers, and records the status line and content type.
// Request-build failures are skipped; transport errors become the Status text.
func probeStreams(client *http.Client, cfg *config, streams []string) []probeResult {
	var results []probeResult
	for _, s := range streams {
		req, err := http.NewRequest(http.MethodHead, s, nil)
		if err != nil {
			continue
		}
		for _, h := range cfg.Headers {
			parts := strings.SplitN(h, ":", 2)
			if len(parts) == 2 {
				req.Header.Set(strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]))
			}
		}
		resp, err := client.Do(req)
		if err != nil {
			results = append(results, probeResult{URL: s, Status: err.Error()})
			continue
		}
		resp.Body.Close()
		results = append(results, probeResult{URL: s, Status: resp.Status, ContentType: resp.Header.Get("Content-Type")})
	}
	return results
}

// outputResults renders results in the requested format (text, json, csv, or
// pls; anything else falls back to text). When w is an http.ResponseWriter
// an appropriate Content-Type header is set for json and csv.
func outputResults(results []scanResult, format string, w io.Writer) {
	format = strings.ToLower(format)
	if rw, ok := w.(http.ResponseWriter); ok {
		switch format {
		case "json":
			rw.Header().Set("Content-Type", "application/json")
		case "csv":
			rw.Header().Set("Content-Type", "text/csv")
		}
	}
	switch format {
	case "json":
		json.NewEncoder(w).Encode(results)
	case "csv":
		cw := csv.NewWriter(w)
		cw.Write([]string{"input_url", "stream_url"})
		for _, res := range results {
			for _, s := range res.Streams {
				cw.Write([]string{res.URL, s})
			}
		}
		cw.Flush()
	case "pls":
		fmt.Fprintln(w, "[playlist]")
		i := 1
		for _, res := range results {
			for _, s := range res.Streams {
				fmt.Fprintf(w, "File%d=%s\n", i, s)
				i++
			}
		}
		fmt.Fprintf(w, "NumberOfEntries=%d\nVersion=2\n", i-1)
	default:
		// Plain-text report, one section per input URL.
		for _, res := range results {
			fmt.Fprintf(w, "URL: %s\n", res.URL)
			if res.Error != "" {
				fmt.Fprintf(w, " error: %s\n", res.Error)
				continue
			}
			if len(res.Streams) == 0 {
				fmt.Fprintln(w, " (no candidate streams found)")
				continue
			}
			for _, s := range res.Streams {
				fmt.Fprintf(w, " - %s\n", s)
			}
		}
	}
}

// newHTTPClient builds the shared HTTP client with a 15-second overall
// timeout. A non-empty, parseable proxyURL is installed as the transport
// proxy; an unparseable one is silently ignored and direct connections are
// used instead.
func newHTTPClient(proxyURL string) *http.Client {
	transport := &http.Transport{}
	if proxyURL != "" {
		if parsed, err := url.Parse(proxyURL); err == nil {
			transport.Proxy = http.ProxyURL(parsed)
		}
	}
	return &http.Client{Timeout: 15 * time.Second, Transport: transport}
}

// uniqueStrings returns the distinct values. Order follows Go map iteration
// and is therefore unspecified; callers needing stable output must sort.
func uniqueStrings(values []string) []string {
	set := make(map[string]struct{}, len(values))
	for _, v := range values {
		set[v] = struct{}{}
	}
	out := make([]string, 0, len(set))
	for v := range set {
		out = append(out, v)
	}
	return out
}

// resolveURL turns href into an absolute URL. Protocol-relative hrefs
// ("//host/...") are upgraded to https; relative hrefs are resolved against
// base. Empty or unparseable hrefs yield ""; an unparseable base returns
// href as-is.
func resolveURL(base, href string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}
	if strings.HasPrefix(href, "//") {
		return "https:" + href
	}
	parsed, err := url.Parse(href)
	if err != nil {
		return ""
	}
	if parsed.IsAbs() {
		return parsed.String()
	}
	baseURL, err := url.Parse(base)
	if err != nil {
		return parsed.String()
	}
	return baseURL.ResolveReference(parsed).String()
}

// historyWriter appends scan results to a JSONL file, serializing concurrent
// writers with a mutex. The zero value with an empty path is a no-op writer.
type historyWriter struct {
	path string
	mu   sync.Mutex
}

// newHistoryWriter creates a history logger for path; an empty path disables
// logging entirely.
func newHistoryWriter(path string) *historyWriter {
	return &historyWriter{path: path}
}

// Write appends each result as one JSON object per line to the history file.
// A nil receiver or empty path disables logging. File-open and marshal errors
// are deliberately swallowed so history logging can never break a scan.
func (h *historyWriter) Write(results []scanResult) {
	if h == nil || h.path == "" {
		return
	}
	h.mu.Lock()
	defer h.mu.Unlock()
	f, err := os.OpenFile(h.path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return
	}
	defer f.Close()
	writer := bufio.NewWriter(f)
	for _, res := range results {
		data, err := json.Marshal(res)
		if err != nil {
			continue
		}
		writer.Write(data)
		writer.WriteString("\n")
	}
	writer.Flush()
}

+ 183
- 159
internal/extractor/extractor.go Visa fil

@@ -1,196 +1,220 @@
package extractor

import (
"regexp"
"sort"
"strings"
"regexp"
"sort"
"strings"
)

// urlPattern matches absolute or protocol-relative URLs ending in a known
// audio/playlist extension; capture group 1 is the full URL.
var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`)
// attrPattern matches JS/HTML key:value assignments whose key suggests a
// stream source; capture group 2 is the quoted value.
var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)
// srcPattern matches any quoted src=... attribute value.
var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`)
// iframePattern captures the src of <iframe> embeds.
var iframePattern = regexp.MustCompile(`(?i)<iframe[^>]+src\s*=\s*['"]([^'"]+)['"]`)
// scriptPattern captures the src of <script> tags.
var scriptPattern = regexp.MustCompile(`(?i)<script[^>]+src\s*=\s*['"]([^'"]+)['"]`)
// audioPattern captures the src of <audio> elements.
var audioPattern = regexp.MustCompile(`(?i)<audio[^>]+src\s*=\s*['"]([^'"]+)['"]`)
// sourcePattern captures the src of <source> elements.
var sourcePattern = regexp.MustCompile(`(?i)<source[^>]+src\s*=\s*['"]([^'"]+)['"]`)
// xspfPattern captures <location> entries in XSPF playlists.
var xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)

// ExtractStreams returns the unique streaming URLs found in the provided
// HTML/text, sorted for deterministic output. Candidates are gathered from
// several regexes (bare URLs, JS-style key/value assignments, generic src
// attributes); URLs from <audio> and <source> tags are kept even without a
// recognized stream extension, since those elements strongly imply playable
// media.
func ExtractStreams(data string) []string {
	candidates := make(map[string]struct{})
	special := make(map[string]struct{}) // kept regardless of extension
	add := func(raw string) {
		if normalized, ok := normalizeCandidate(raw); ok {
			candidates[normalized] = struct{}{}
		}
	}
	addSpecial := func(raw string) {
		if normalized, ok := normalizeCandidate(raw); ok {
			candidates[normalized] = struct{}{}
			special[normalized] = struct{}{}
		}
	}
	for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
		add(match[1])
	}
	for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
		add(match[2])
	}
	for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
		add(match[1])
	}
	for _, match := range audioPattern.FindAllStringSubmatch(data, -1) {
		addSpecial(match[1])
	}
	for _, match := range sourcePattern.FindAllStringSubmatch(data, -1) {
		addSpecial(match[1])
	}
	streams := make([]string, 0, len(candidates))
	for u := range candidates {
		if isStreamURL(u) {
			streams = append(streams, u)
			continue
		}
		if _, ok := special[u]; ok {
			streams = append(streams, u)
		}
	}
	sort.Strings(streams)
	return streams
}

// ExtractPlaylistLinks returns unique URLs likely pointing to playlists
// (m3u/pls/xspf/json), sorted for deterministic output.
func ExtractPlaylistLinks(data string) []string {
	candidates := make(map[string]struct{})
	add := func(raw string) {
		if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
			candidates[normalized] = struct{}{}
		}
	}
	for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
		add(match[1])
	}
	for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
		add(match[2])
	}
	for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
		add(match[1])
	}
	links := make([]string, 0, len(candidates))
	for u := range candidates {
		links = append(links, u)
	}
	sort.Strings(links)
	return links
}

// ExtractEmbedURLs returns URLs found in iframe embeds.
func ExtractEmbedURLs(data string) []string {
candidates := make(map[string]struct{})
for _, match := range iframePattern.FindAllStringSubmatch(data, -1) {
if normalized, ok := normalizeCandidate(match[1]); ok {
candidates[normalized] = struct{}{}
}
}

urls := make([]string, 0, len(candidates))
for u := range candidates {
urls = append(urls, u)
}
sort.Strings(urls)
return urls
return extractURLs(iframePattern, data)
}

// ExtractScriptURLs returns URLs referenced by script tags. Root-relative
// paths (e.g. "/js/app.js") are kept so callers can resolve them against the
// page URL before fetching.
func ExtractScriptURLs(data string) []string {
// Delegates to the shared regex-extraction helper with the <script src=…> pattern.
return extractURLs(scriptPattern, data)
}

// ParsePlaylist extracts stream URLs from playlist content. contentType
// hints at the format (m3u/pls/xspf/json) but the content itself is also
// sniffed. Results are sorted for deterministic output.
func ParsePlaylist(content string, contentType string) []string {
	candidates := make(map[string]struct{})
	// normalize trims the value and upgrades protocol-relative URLs to https.
	normalize := func(raw string) string {
		raw = strings.TrimSpace(raw)
		if strings.HasPrefix(raw, "//") {
			raw = "https:" + raw
		}
		return raw
	}
	// add keeps only URLs with a recognized stream extension.
	add := func(raw string) {
		if raw = normalize(raw); raw != "" && isStreamURL(raw) {
			candidates[raw] = struct{}{}
		}
	}
	// addForce keeps the value even without a recognized extension — used
	// for playlist lines that carry a URL the extension regex cannot match.
	addForce := func(raw string) {
		if raw = normalize(raw); raw != "" {
			candidates[raw] = struct{}{}
		}
	}

	lowerType := strings.ToLower(contentType)

	// XSPF: <location> entries.
	if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "<location>") {
		for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
			add(match[1])
		}
	}

	// Global scan for extension-bearing URLs anywhere in the content.
	for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
		add(match[1])
	}

	// Line-oriented formats: M3U entries and PLS "FileN=" keys.
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
			parts := strings.SplitN(line, "=", 2)
			add(parts[1])
			continue
		}
		if strings.Contains(line, "http") {
			matched := false
			for _, match := range urlPattern.FindAllStringSubmatch(line, -1) {
				add(match[1])
				matched = true
			}
			// Extensionless URL line (e.g. a bare mountpoint): keep it anyway.
			if !matched {
				addForce(line)
			}
		}
	}

	// JSON payloads have no line structure; rescan the whole body.
	if strings.Contains(lowerType, "json") {
		for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
			add(match[1])
		}
	}

	streams := make([]string, 0, len(candidates))
	for u := range candidates {
		streams = append(streams, u)
	}
	sort.Strings(streams)
	return streams
}
// extractURLs applies pattern to data, normalizes every first capture group,
// and returns the unique surviving URLs in sorted order.
func extractURLs(pattern *regexp.Regexp, data string) []string {
	seen := make(map[string]struct{})
	for _, m := range pattern.FindAllStringSubmatch(data, -1) {
		u, ok := normalizeCandidate(m[1])
		if !ok {
			continue
		}
		seen[u] = struct{}{}
	}
	out := make([]string, 0, len(seen))
	for u := range seen {
		out = append(out, u)
	}
	sort.Strings(out)
	return out
}

// normalizeCandidate cleans a raw URL candidate. It accepts values that
// contain "http", start with "//" (upgraded to https), or start with "/"
// (root-relative paths, so script assets can be resolved later). Trailing
// '+' characters (JS string-concatenation residue) and escaped backslashes
// are stripped. The boolean reports whether the candidate is usable.
func normalizeCandidate(raw string) (string, bool) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return "", false
	}
	if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//") || strings.HasPrefix(raw, "/")) {
		return "", false
	}
	if strings.HasPrefix(raw, "//") {
		raw = "https:" + raw
	}
	normalized := strings.TrimRight(raw, "+")
	normalized = strings.ReplaceAll(normalized, `\\`, "")
	if normalized == "" {
		return "", false
	}
	return normalized, true
}

// isStreamURL reports whether u looks like a direct audio/stream URL,
// judged case-insensitively by the presence of a known stream extension
// anywhere in the URL (so query strings after the extension still match).
func isStreamURL(u string) bool {
	lower := strings.ToLower(u)
	return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") ||
		strings.Contains(lower, ".ogg") || strings.Contains(lower, ".opus")
}

// isPlaylistURL reports whether u looks like a playlist URL, judged
// case-insensitively by extension substring. Note ".m3u" also matches
// ".m3u8" URLs — a URL can be both a stream and a playlist candidate.
func isPlaylistURL(u string) bool {
	lower := strings.ToLower(u)
	return strings.Contains(lower, ".m3u") || strings.Contains(lower, ".pls") ||
		strings.Contains(lower, ".xspf") || strings.Contains(lower, ".json")
}

+ 49
- 40
internal/extractor/extractor_test.go Visa fil

@@ -1,12 +1,12 @@
package extractor

import (
"reflect"
"testing"
"reflect"
"testing"
)

func TestExtractStreams(t *testing.T) {
html := `
html := `
<script>
var streamsrc = 'https://example.com/live/stream.mp3';
var streamhash="https://cdn.example.net/relay.m3u8";
@@ -19,56 +19,65 @@ func TestExtractStreams(t *testing.T) {
<div data-value="https://example.com/secret.pls"></div>
`

streams := ExtractStreams(html)
if len(streams) != 7 {
t.Fatalf("wanted 7 streams, got %d: %v", len(streams), streams)
}
found := false
for _, s := range streams {
if s == "https://stream.example.com/live" {
found = true
break
}
}
if !found {
t.Fatalf("expected audio tag stream to be present: %v", streams)
}
streams := ExtractStreams(html)
if len(streams) != 7 {
t.Fatalf("wanted 7 streams, got %d: %v", len(streams), streams)
}
found := false
for _, s := range streams {
if s == "https://stream.example.com/live" {
found = true
break
}
}
if !found {
t.Fatalf("expected audio tag stream to be present: %v", streams)
}
}

func TestExtractPlaylistLinks(t *testing.T) {
html := `
html := `
<a href="https://example.com/stream.m3u">m3u</a>
<a href="https://example.com/playlist.pls">pls</a>
<a href="https://example.com/radio.xspf">xspf</a>
<a href="https://example.com/data.json">json</a>
`
links := ExtractPlaylistLinks(html)
if len(links) != 4 {
t.Fatalf("wanted 4 playlist links, got %d: %v", len(links), links)
}
links := ExtractPlaylistLinks(html)
if len(links) != 4 {
t.Fatalf("wanted 4 playlist links, got %d: %v", len(links), links)
}
}

// TestExtractEmbedURLs verifies protocol-relative srcs are upgraded to https
// and that results come back in sorted order.
func TestExtractEmbedURLs(t *testing.T) {
	html := `<iframe src="//example.com/embed"></iframe><iframe src="https://example.org/player"></iframe>`
	urls := ExtractEmbedURLs(html)
	want := []string{"https://example.com/embed", "https://example.org/player"}
	if !reflect.DeepEqual(urls, want) {
		t.Fatalf("wanted iframe URLs %v, got %v", want, urls)
	}
}

// TestExtractScriptURLs checks that both root-relative and absolute script
// srcs are extracted, returned in sorted order.
func TestExtractScriptURLs(t *testing.T) {
html := `<script src="/js/app.js"></script><script src="https://example.org/player.js"></script>`
urls := ExtractScriptURLs(html)
want := []string{"/js/app.js", "https://example.org/player.js"}
if !reflect.DeepEqual(urls, want) {
t.Fatalf("wanted script URLs %v, got %v", want, urls)
}
}

func TestParsePlaylist(t *testing.T) {
m3u := "#EXTM3U\nhttps://example.com/live.mp3\n"
pls := "[playlist]\nFile1=https://example.com/stream.aac\n"
xspf := "<playlist><location>https://example.com/hls.m3u8</location></playlist>"
m3u := "#EXTM3U\nhttps://example.com/live.mp3\n"
pls := "[playlist]\nFile1=https://example.com/stream.aac\n"
xspf := "<playlist><location>https://example.com/hls.m3u8</location></playlist>"

if len(ParsePlaylist(m3u, "audio/x-mpegurl")) != 1 {
t.Fatal("expected m3u playlist to yield 1 stream")
}
if len(ParsePlaylist(pls, "audio/x-scpls")) != 1 {
t.Fatal("expected pls playlist to yield 1 stream")
}
if len(ParsePlaylist(xspf, "application/xspf+xml")) != 1 {
t.Fatal("expected xspf playlist to yield 1 stream")
}
if len(ParsePlaylist(m3u, "audio/x-mpegurl")) != 1 {
t.Fatal("expected m3u playlist to yield 1 stream")
}
if len(ParsePlaylist(pls, "audio/x-scpls")) != 1 {
t.Fatal("expected pls playlist to yield 1 stream")
}
if len(ParsePlaylist(xspf, "application/xspf+xml")) != 1 {
t.Fatal("expected xspf playlist to yield 1 stream")
}
}

Laddar…
Avbryt
Spara