commit fb70d6456cfb725deea883fa95a8d759beeff684 Author: Alfred Date: Mon Mar 16 07:13:24 2026 +0100 chore: import radio-stream-extractor diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b9a7939 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Build artifacts +radiostreamscan.exe +radiostreamscan.zip + +# Runtime logs and history +history.jsonl diff --git a/README.md b/README.md new file mode 100644 index 0000000..c854d78 --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# radio-stream-extractor + +`radiostreamscan` ist ein Go-Tool, das Radio-Streams aus Webseiten extrahiert. Es unterstützt: + +- **Web-UI (Default)**: Start ohne Argumente öffnet einen Server auf `:8080` mit Formular + JSON-Endpoint. +- **CLI-Modus**: Übergibst du URLs als Argumente, werden Streams direkt ausgegeben. +- **Playlist-Auflösung**: m3u/m3u8/pls/xspf/json werden erkannt, geladen und in echte Stream-URLs aufgelöst. +- **Probing**: optionaler HEAD-Check für Stream-URLs (`-probe`). +- **Export**: `-format text|json|csv|pls`. +- **History-Log**: Ergebnisse werden als JSONL in `history.jsonl` gespeichert (abschaltbar mit `-history ""`). +- **Concurrency**: parallelisierte Fetches mit `-concurrency`. + +## Web-Server-Modus (Standard) + +```sh +go run ./cmd/radiostreamscan +``` + +Danach erreichst du die UI unter `http://localhost:8080/` (inkl. Ergebnis-Ansicht mit Copy-to-Clipboard). Der JSON-Endpoint ist: + +``` +/scan?url=https://example.com +``` + +Mehrere URLs: + +``` +/scan?url=a&url=b&url=c +``` + +## CLI-Modus + +```sh +go run ./cmd/radiostreamscan https://live24.gr/radio/generic.jsp?sid=2676 +``` + +## Flags (Auszug) + +- `-format` (text|json|csv|pls) +- `-probe` (true/false) +- `-header "Key: Value"` (repeatable) +- `-proxy http://host:port` +- `-history history.jsonl` (leer = aus) +- `-watch 30s` (CLI wiederholt den Scan) +- `-concurrency 4` + +## Build / EXE + +```sh +go build -o radiostreamscan.exe ./cmd/radiostreamscan +``` + +Die EXE enthält Web-Server und CLI in einem Binary. + +## Tests + +```sh +go test ./... +``` + +## Projektstruktur + +- `cmd/radiostreamscan`: Hauptentrypoint mit URL-Scan, Webserver und Exporten +- `internal/extractor`: Parser für Candidate-URLs + Playlist-Parser mit Unit-Tests diff --git a/cmd/radiostreamscan/main.go b/cmd/radiostreamscan/main.go new file mode 100644 index 0000000..dd87f7a --- /dev/null +++ b/cmd/radiostreamscan/main.go @@ -0,0 +1,504 @@ +package main + +import ( + "bufio" + "encoding/csv" + "encoding/json" + "flag" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "sync" + "time" + + "radio-stream-extractor/internal/extractor" +) + +type scanResult struct { + URL string `json:"url"` + Streams []string `json:"streams"` + Playlists []string `json:"playlists,omitempty"` + Probes []probeResult `json:"probes,omitempty"` + Error string `json:"error,omitempty"` + FetchedAt time.Time `json:"fetchedAt"` + FromPlaylist bool `json:"fromPlaylist"` +} + +type probeResult struct { + URL string `json:"url"` + Status string `json:"status"` + ContentType string `json:"contentType,omitempty"` +} + +type config struct { + Format string + Probe bool + Headers headerList + Proxy string + HistoryPath string + Watch time.Duration + Concurrency int +} + +type headerList []string + +func (h *headerList) String() string { return strings.Join(*h, ", ") } +func (h *headerList) Set(v string) error { + *h = append(*h, v) + return nil +} + +func main() { + port := flag.String("port", ":8080", "listen address for the web server (default :8080)") + web := flag.Bool("web", false, "force web-server mode even when URLs are provided") + + cfg := config{} + flag.StringVar(&cfg.Format, "format", "text", "output format: text|json|csv|pls") + flag.BoolVar(&cfg.Probe, "probe", true, "probe discovered stream URLs with HTTP HEAD") + flag.Var(&cfg.Headers, "header", "custom HTTP header (repeatable), e.g. -header 'Referer: https://example.com'") + flag.StringVar(&cfg.Proxy, "proxy", "", "HTTP proxy URL (optional)") + flag.StringVar(&cfg.HistoryPath, "history", "history.jsonl", "path to JSONL history log (empty to disable)") + flag.DurationVar(&cfg.Watch, "watch", 0, "repeat scan in CLI mode at interval (e.g. 30s, 2m)") + flag.IntVar(&cfg.Concurrency, "concurrency", 4, "number of concurrent fetch workers") + + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] [url...]\n", os.Args[0]) + flag.PrintDefaults() + } + flag.Parse() + + urls := flag.Args() + client := newHTTPClient(cfg.Proxy) + history := newHistoryWriter(cfg.HistoryPath) + + if *web || len(urls) == 0 { + if err := runWebMode(*port, client, &cfg, history); err != nil { + fmt.Fprintf(os.Stderr, "web mode failed: %v\n", err) + os.Exit(1) + } + return + } + + runCLIMode(urls, client, &cfg, history) +} + +func runCLIMode(urls []string, client *http.Client, cfg *config, history *historyWriter) { + for { + results := scanURLs(urls, client, cfg) + outputResults(results, cfg.Format, os.Stdout) + history.Write(results) + if cfg.Watch == 0 { + return + } + time.Sleep(cfg.Watch) + } +} + +func runWebMode(addr string, client *http.Client, cfg *config, history *historyWriter) error { + mux := http.NewServeMux() + mux.HandleFunc("/", indexHandler) + mux.HandleFunc("/scan", makeScanHandler(client, cfg, history)) + mux.HandleFunc("/watch", watchHandler) + + fmt.Printf("radiostreamscan listening on %s (GET /scan?url=... or POST url=...)\n", addr) + return http.ListenAndServe(addr, mux) +} + +func indexHandler(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, ` + + radiostreamscan + +

radiostreamscan

+
+
+
+ + + + +
+

Mehrere URLs: /scan?url=a&url=b&url=c

+ +`) +} + +func watchHandler(w http.ResponseWriter, r *http.Request) { + urls := normalizeURLInputs(r.URL.Query()["url"]) + interval := r.URL.Query().Get("interval") + probe := r.URL.Query().Get("probe") + fmt.Fprintf(w, ` + +radiostreamscan results + + + +

radiostreamscan results

+ +
Loading...
+ + + +`, strings.Join(urls, "\n"), interval, probe) +} + +func makeScanHandler(client *http.Client, cfg *config, history *historyWriter) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var urls []string + switch r.Method { + case http.MethodGet: + urls = r.URL.Query()["url"] + case http.MethodPost: + if err := r.ParseForm(); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + urls = r.Form["url"] + default: + http.Error(w, "only GET and POST supported", http.StatusMethodNotAllowed) + return + } + + urls = normalizeURLInputs(urls) + if len(urls) == 0 { + http.Error(w, "provide at least one url parameter", http.StatusBadRequest) + return + } + + localCfg := *cfg + if r.URL.Query().Get("probe") == "1" { + localCfg.Probe = true + } else if r.URL.Query().Get("probe") == "0" { + localCfg.Probe = false + } + if f := r.URL.Query().Get("format"); f != "" { + localCfg.Format = f + } + + results := scanURLs(urls, client, &localCfg) + history.Write(results) + outputResults(results, localCfg.Format, w) + } +} + +func normalizeURLInputs(inputs []string) []string { + var urls []string + for _, item := range inputs { + for _, line := range strings.Split(item, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + urls = append(urls, line) + } + } + return urls +} + +func scanURLs(urls []string, client *http.Client, cfg *config) []scanResult { + results := make([]scanResult, len(urls)) + type job struct { + index int + url string + } + jobs := make(chan job) + var wg sync.WaitGroup + + workers := cfg.Concurrency + if workers < 1 { + workers = 1 + } + + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + res := scanOneURL(client, cfg, j.url) + results[j.index] = res + } + }() + } + + for i, u := range urls { + jobs <- job{index: i, url: u} + } + close(jobs) + wg.Wait() + return results +} + +func scanOneURL(client *http.Client, cfg *config, raw string) scanResult { + res := scanResult{URL: raw, FetchedAt: time.Now()} + html, contentType, err := fetchContent(client, cfg, raw) + if err != nil { + res.Error = err.Error() + return res + } + + streams := extractor.ExtractStreams(html) + playlists := extractor.ExtractPlaylistLinks(html) + res.Playlists = playlists + + for _, pl := range playlists { + plContent, plType, err := fetchContent(client, cfg, pl) + if err != nil { + continue + } + parsed := extractor.ParsePlaylist(plContent, plType) + if len(parsed) > 0 { + streams = append(streams, parsed...) + res.FromPlaylist = true + } + } + + res.Streams = uniqueStrings(streams) + + if cfg.Probe { + res.Probes = probeStreams(client, cfg, res.Streams) + } + + _ = contentType + return res +} + +func fetchContent(client *http.Client, cfg *config, raw string) (string, string, error) { + req, err := http.NewRequest(http.MethodGet, raw, nil) + if err != nil { + return "", "", err + } + req.Header.Set("User-Agent", "radiostreamscan/0.2") + for _, h := range cfg.Headers { + parts := strings.SplitN(h, ":", 2) + if len(parts) == 2 { + req.Header.Set(strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1])) + } + } + + resp, err := client.Do(req) + if err != nil { + return "", "", err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return "", "", fmt.Errorf("unexpected status %s", resp.Status) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 2<<20)) + if err != nil { + return "", "", err + } + + return string(body), resp.Header.Get("Content-Type"), nil +} + +func probeStreams(client *http.Client, cfg *config, streams []string) []probeResult { + var results []probeResult + for _, s := range streams { + req, err := http.NewRequest(http.MethodHead, s, nil) + if err != nil { + continue + } + for _, h := range cfg.Headers { + parts := strings.SplitN(h, ":", 2) + if len(parts) == 2 { + req.Header.Set(strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1])) + } + } + resp, err := client.Do(req) + if err != nil { + results = append(results, probeResult{URL: s, Status: err.Error()}) + continue + } + resp.Body.Close() + results = append(results, probeResult{URL: s, Status: resp.Status, ContentType: resp.Header.Get("Content-Type")}) + } + return results +} + +func outputResults(results []scanResult, format string, w io.Writer) { + if rw, ok := w.(http.ResponseWriter); ok { + if strings.ToLower(format) == "json" { + rw.Header().Set("Content-Type", "application/json") + } else if strings.ToLower(format) == "csv" { + rw.Header().Set("Content-Type", "text/csv") + } + } + + switch strings.ToLower(format) { + case "json": + json.NewEncoder(w).Encode(results) + case "csv": + cw := csv.NewWriter(w) + cw.Write([]string{"input_url", "stream_url"}) + for _, res := range results { + for _, s := range res.Streams { + cw.Write([]string{res.URL, s}) + } + } + cw.Flush() + case "pls": + fmt.Fprintln(w, "[playlist]") + i := 1 + for _, res := range results { + for _, s := range res.Streams { + fmt.Fprintf(w, "File%d=%s\n", i, s) + i++ + } + } + fmt.Fprintf(w, "NumberOfEntries=%d\nVersion=2\n", i-1) + default: + for _, res := range results { + fmt.Fprintf(w, "URL: %s\n", res.URL) + if res.Error != "" { + fmt.Fprintf(w, " error: %s\n", res.Error) + continue + } + if len(res.Streams) == 0 { + fmt.Fprintln(w, " (no candidate streams found)") + continue + } + for _, s := range res.Streams { + fmt.Fprintf(w, " - %s\n", s) + } + } + } +} + +func newHTTPClient(proxyURL string) *http.Client { + transport := &http.Transport{} + if proxyURL != "" { + if parsed, err := url.Parse(proxyURL); err == nil { + transport.Proxy = http.ProxyURL(parsed) + } + } + return &http.Client{Timeout: 15 * time.Second, Transport: transport} +} + +func uniqueStrings(values []string) []string { + set := make(map[string]struct{}) + for _, v := range values { + set[v] = struct{}{} + } + out := make([]string, 0, len(set)) + for v := range set { + out = append(out, v) + } + return out +} + +type historyWriter struct { + path string + mu sync.Mutex +} + +func newHistoryWriter(path string) *historyWriter { + return &historyWriter{path: path} +} + +func (h *historyWriter) Write(results []scanResult) { + if h == nil || h.path == "" { + return + } + h.mu.Lock() + defer h.mu.Unlock() + + f, err := os.OpenFile(h.path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + return + } + defer f.Close() + + writer := bufio.NewWriter(f) + for _, res := range results { + data, err := json.Marshal(res) + if err != nil { + continue + } + writer.Write(data) + writer.WriteString("\n") + } + writer.Flush() +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..1713d8c --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module radio-stream-extractor + +go 1.21 diff --git a/internal/extractor/extractor.go b/internal/extractor/extractor.go new file mode 100644 index 0000000..94023eb --- /dev/null +++ b/internal/extractor/extractor.go @@ -0,0 +1,162 @@ +package extractor + +import ( + "regexp" + "sort" + "strings" +) + +var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`) +var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`) +var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`) +var xspfPattern = regexp.MustCompile(`(?i)([^<]+)`) + +// ExtractStreams returns the unique streaming URLs found in the provided HTML/text. +func ExtractStreams(data string) []string { + candidates := make(map[string]struct{}) + add := func(raw string) { + raw = strings.TrimSpace(raw) + if raw == "" { + return + } + if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) { + return + } + if strings.HasPrefix(raw, "//") { + raw = "https:" + raw + } + normalized := strings.TrimRight(raw, "+") + normalized = strings.ReplaceAll(normalized, `\\`, "") + candidates[normalized] = struct{}{} + } + + for _, match := range urlPattern.FindAllStringSubmatch(data, -1) { + add(match[1]) + } + for _, match := range attrPattern.FindAllStringSubmatch(data, -1) { + add(match[2]) + } + for _, match := range srcPattern.FindAllStringSubmatch(data, -1) { + add(match[1]) + } + + streams := make([]string, 0, len(candidates)) + for u := range candidates { + if isStreamURL(u) { + streams = append(streams, u) + } + } + sort.Strings(streams) + return streams +} + +// ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json). +func ExtractPlaylistLinks(data string) []string { + candidates := make(map[string]struct{}) + add := func(raw string) { + raw = strings.TrimSpace(raw) + if raw == "" { + return + } + if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//")) { + return + } + if strings.HasPrefix(raw, "//") { + raw = "https:" + raw + } + normalized := strings.TrimRight(raw, "+") + normalized = strings.ReplaceAll(normalized, `\\`, "") + if isPlaylistURL(normalized) { + candidates[normalized] = struct{}{} + } + } + + for _, match := range urlPattern.FindAllStringSubmatch(data, -1) { + add(match[1]) + } + for _, match := range attrPattern.FindAllStringSubmatch(data, -1) { + add(match[2]) + } + for _, match := range srcPattern.FindAllStringSubmatch(data, -1) { + add(match[1]) + } + + links := make([]string, 0, len(candidates)) + for u := range candidates { + links = append(links, u) + } + sort.Strings(links) + return links +} + +// ParsePlaylist extracts stream URLs from playlist content. +func ParsePlaylist(content string, contentType string) []string { + candidates := make(map[string]struct{}) + add := func(raw string) { + raw = strings.TrimSpace(raw) + if raw == "" { + return + } + if strings.HasPrefix(raw, "//") { + raw = "https:" + raw + } + if isStreamURL(raw) { + candidates[raw] = struct{}{} + } + } + + lowerType := strings.ToLower(contentType) + lines := strings.Split(content, "\n") + + if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "") { + for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) { + add(match[1]) + } + } + + for _, match := range urlPattern.FindAllStringSubmatch(content, -1) { + add(match[1]) + } + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") { + parts := strings.SplitN(line, "=", 2) + add(parts[1]) + continue + } + if strings.Contains(line, "http") { + for _, match := range urlPattern.FindAllStringSubmatch(line, -1) { + add(match[1]) + } + } + } + + if strings.Contains(lowerType, "json") { + for _, match := range urlPattern.FindAllStringSubmatch(content, -1) { + add(match[1]) + } + } + + streams := make([]string, 0, len(candidates)) + for u := range candidates { + streams = append(streams, u) + } + sort.Strings(streams) + return streams +} + +func isStreamURL(u string) bool { + lower := strings.ToLower(u) + return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") || + strings.Contains(lower, ".ogg") || strings.Contains(lower, ".opus") +} + +func isPlaylistURL(u string) bool { + lower := strings.ToLower(u) + return strings.Contains(lower, ".m3u") || strings.Contains(lower, ".pls") || + strings.Contains(lower, ".xspf") || strings.Contains(lower, ".json") +} diff --git a/internal/extractor/extractor_test.go b/internal/extractor/extractor_test.go new file mode 100644 index 0000000..c1747fd --- /dev/null +++ b/internal/extractor/extractor_test.go @@ -0,0 +1,51 @@ +package extractor + +import "testing" + +func TestExtractStreams(t *testing.T) { + html := ` + + listen + + +
+ ` + + streams := ExtractStreams(html) + if len(streams) != 6 { + t.Fatalf("wanted 6 streams, got %d: %v", len(streams), streams) + } +} + +func TestExtractPlaylistLinks(t *testing.T) { + html := ` + m3u + pls + xspf + json + ` + links := ExtractPlaylistLinks(html) + if len(links) != 4 { + t.Fatalf("wanted 4 playlist links, got %d: %v", len(links), links) + } +} + +func TestParsePlaylist(t *testing.T) { + m3u := "#EXTM3U\nhttps://example.com/live.mp3\n" + pls := "[playlist]\nFile1=https://example.com/stream.aac\n" + xspf := "https://example.com/hls.m3u8" + + if len(ParsePlaylist(m3u, "audio/x-mpegurl")) != 1 { + t.Fatal("expected m3u playlist to yield 1 stream") + } + if len(ParsePlaylist(pls, "audio/x-scpls")) != 1 { + t.Fatal("expected pls playlist to yield 1 stream") + } + if len(ParsePlaylist(xspf, "application/xspf+xml")) != 1 { + t.Fatal("expected xspf playlist to yield 1 stream") + } +}