Browse Source

chore: import radio-stream-extractor

master
Alfred 5 days ago
commit
fb70d6456c
6 changed files with 790 additions and 0 deletions
  1. +6
    -0
      .gitignore
  2. +64
    -0
      README.md
  3. +504
    -0
      cmd/radiostreamscan/main.go
  4. +3
    -0
      go.mod
  5. +162
    -0
      internal/extractor/extractor.go
  6. +51
    -0
      internal/extractor/extractor_test.go

+ 6
- 0
.gitignore View File

@@ -0,0 +1,6 @@
# Build artifacts
radiostreamscan.exe
radiostreamscan.zip

# Runtime logs and history
history.jsonl

+ 64
- 0
README.md View File

@@ -0,0 +1,64 @@
# radio-stream-extractor

`radiostreamscan` ist ein Go-Tool, das Radio-Streams aus Webseiten extrahiert. Es unterstützt:

- **Web-UI (Default)**: Start ohne Argumente öffnet einen Server auf `:8080` mit Formular + JSON-Endpoint.
- **CLI-Modus**: Übergibst du URLs als Argumente, werden Streams direkt ausgegeben.
- **Playlist-Auflösung**: m3u/m3u8/pls/xspf/json werden erkannt, geladen und in echte Stream-URLs aufgelöst.
- **Probing**: optionaler HEAD-Check für Stream-URLs (`-probe`).
- **Export**: `-format text|json|csv|pls`.
- **History-Log**: Ergebnisse werden als JSONL in `history.jsonl` gespeichert (abschaltbar mit `-history ""`).
- **Concurrency**: parallelisierte Fetches mit `-concurrency`.

## Web-Server-Modus (Standard)

```sh
go run ./cmd/radiostreamscan
```

Danach erreichst du die UI unter `http://localhost:8080/` (inkl. Ergebnis-Ansicht mit Copy-to-Clipboard). Der JSON-Endpoint ist:

```
/scan?url=https://example.com
```

Mehrere URLs:

```
/scan?url=a&url=b&url=c
```

## CLI-Modus

```sh
go run ./cmd/radiostreamscan "https://live24.gr/radio/generic.jsp?sid=2676"
```

## Flags (Auszug)

- `-format` (text|json|csv|pls)
- `-probe` (true/false)
- `-header "Key: Value"` (repeatable)
- `-proxy http://host:port`
- `-history history.jsonl` (leer = aus)
- `-watch 30s` (CLI wiederholt den Scan)
- `-concurrency 4`

## Build / EXE

```sh
go build -o radiostreamscan.exe ./cmd/radiostreamscan
```

Die EXE enthält Web-Server und CLI in einem Binary.

## Tests

```sh
go test ./...
```

## Projektstruktur

- `cmd/radiostreamscan`: Hauptentrypoint mit URL-Scan, Webserver und Exporten
- `internal/extractor`: Parser für Candidate-URLs + Playlist-Parser mit Unit-Tests

+ 504
- 0
cmd/radiostreamscan/main.go View File

@@ -0,0 +1,504 @@
package main

import (
"bufio"
"encoding/csv"
"encoding/json"
"flag"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"

"radio-stream-extractor/internal/extractor"
)

// scanResult is the outcome of scanning one input URL, serialized to JSON
// for the /scan endpoint and the history log.
type scanResult struct {
	URL          string        `json:"url"`                 // the input URL that was scanned
	Streams      []string      `json:"streams"`             // de-duplicated candidate stream URLs
	Playlists    []string      `json:"playlists,omitempty"` // playlist links found on the page
	Probes       []probeResult `json:"probes,omitempty"`    // HEAD-probe outcomes (when probing is enabled)
	Error        string        `json:"error,omitempty"`     // fetch/scan error text, empty on success
	FetchedAt    time.Time     `json:"fetchedAt"`           // when the scan ran
	FromPlaylist bool          `json:"fromPlaylist"`        // true if any stream came from a resolved playlist
}

// probeResult records the outcome of a HEAD request against one stream URL.
type probeResult struct {
	URL         string `json:"url"`
	Status      string `json:"status"`                // HTTP status line, or the transport error text on failure
	ContentType string `json:"contentType,omitempty"` // Content-Type reported by the server, if any
}

// config carries the scan options shared by CLI and web modes.
type config struct {
	Format      string        // output format: text|json|csv|pls
	Probe       bool          // HEAD-probe discovered stream URLs
	Headers     headerList    // extra request headers ("Key: Value"), repeatable
	Proxy       string        // optional HTTP proxy URL
	HistoryPath string        // JSONL history file path; empty disables logging
	Watch       time.Duration // CLI repeat interval; 0 means scan once
	Concurrency int           // number of parallel fetch workers
}

// headerList implements flag.Value so -header may be given repeatedly;
// each value is collected verbatim ("Key: Value").
type headerList []string

// String renders the collected headers for flag help/diagnostics.
func (h *headerList) String() string {
	return strings.Join([]string(*h), ", ")
}

// Set appends one header value. It never fails, so the flag package
// accepts any number of repetitions.
func (h *headerList) Set(value string) error {
	*h = append(*h, value)
	return nil
}

// main wires up flags and dispatches to web-server or CLI mode.
// With no URL arguments (or an explicit -web) the web server runs;
// otherwise the given URLs are scanned directly.
func main() {
	var (
		port = flag.String("port", ":8080", "listen address for the web server (default :8080)")
		web  = flag.Bool("web", false, "force web-server mode even when URLs are provided")
		cfg  config
	)
	flag.StringVar(&cfg.Format, "format", "text", "output format: text|json|csv|pls")
	flag.BoolVar(&cfg.Probe, "probe", true, "probe discovered stream URLs with HTTP HEAD")
	flag.Var(&cfg.Headers, "header", "custom HTTP header (repeatable), e.g. -header 'Referer: https://example.com'")
	flag.StringVar(&cfg.Proxy, "proxy", "", "HTTP proxy URL (optional)")
	flag.StringVar(&cfg.HistoryPath, "history", "history.jsonl", "path to JSONL history log (empty to disable)")
	flag.DurationVar(&cfg.Watch, "watch", 0, "repeat scan in CLI mode at interval (e.g. 30s, 2m)")
	flag.IntVar(&cfg.Concurrency, "concurrency", 4, "number of concurrent fetch workers")

	flag.Usage = func() {
		fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] <url> [url...]\n", os.Args[0])
		flag.PrintDefaults()
	}
	flag.Parse()

	args := flag.Args()
	httpClient := newHTTPClient(cfg.Proxy)
	hist := newHistoryWriter(cfg.HistoryPath)

	// Web mode when forced or when no URLs were supplied.
	if *web || len(args) == 0 {
		if err := runWebMode(*port, httpClient, &cfg, hist); err != nil {
			fmt.Fprintf(os.Stderr, "web mode failed: %v\n", err)
			os.Exit(1)
		}
		return
	}

	runCLIMode(args, httpClient, &cfg, hist)
}

// runCLIMode scans the URLs once, prints the results, and appends them to
// the history log. With a non-zero -watch interval it repeats forever,
// sleeping between rounds.
func runCLIMode(urls []string, client *http.Client, cfg *config, history *historyWriter) {
	for {
		batch := scanURLs(urls, client, cfg)
		outputResults(batch, cfg.Format, os.Stdout)
		history.Write(batch)
		if cfg.Watch == 0 {
			return
		}
		time.Sleep(cfg.Watch)
	}
}

// runWebMode serves the HTML form (/), the result/poll page (/watch), and
// the scan API (/scan) on addr. It blocks until the listener fails.
func runWebMode(addr string, client *http.Client, cfg *config, history *historyWriter) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/", indexHandler)
	mux.HandleFunc("/watch", watchHandler)
	mux.HandleFunc("/scan", makeScanHandler(client, cfg, history))

	fmt.Printf("radiostreamscan listening on %s (GET /scan?url=... or POST url=...)\n", addr)
	return http.ListenAndServe(addr, mux)
}

func indexHandler(w http.ResponseWriter, r *http.Request) {
fmt.Fprintf(w, `<!doctype html>
<html>
<head><meta charset="utf-8"><title>radiostreamscan</title></head>
<body>
<h1>radiostreamscan</h1>
<form method="get" action="/watch">
<label>Stream-URLs (eine pro Zeile)</label><br/>
<textarea name="url" rows="6" cols="80" required></textarea><br/>
<label>Format
<select name="format">
<option value="json">json</option>
<option value="text">text</option>
<option value="csv">csv</option>
<option value="pls">pls</option>
</select>
</label>
<label>Auto-Refresh (Sekunden)
<input type="number" name="interval" value="0" min="0" />
</label>
<label><input type="checkbox" name="probe" value="1" checked> Probing</label>
<button type="submit">Scan</button>
</form>
<p>Mehrere URLs: /scan?url=a&url=b&url=c</p>
</body>
</html>`)
}

// watchHandler renders the result page whose inline JavaScript polls /scan
// and re-renders the stream lists.
//
// SECURITY FIX: the user-supplied values (urls, interval, probe) are embedded
// inside a <script> block. The previous code used %q, which Go-quotes the
// string but does NOT escape '<', so a url parameter containing
// "</script><script>..." broke out of the script element (reflected XSS).
// json.Marshal escapes '<', '>', and '&' to \u003c / \u003e / \u0026 and
// produces a valid JS string literal, so the payload stays inert.
func watchHandler(w http.ResponseWriter, r *http.Request) {
	urls := normalizeURLInputs(r.URL.Query()["url"])
	interval := r.URL.Query().Get("interval")
	probe := r.URL.Query().Get("probe")

	// JSON-encode each injected value; errors are impossible for plain strings.
	urlsJS, _ := json.Marshal(strings.Join(urls, "\n"))
	intervalJS, _ := json.Marshal(interval)
	probeJS, _ := json.Marshal(probe)

	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	fmt.Fprintf(w, `<!doctype html>
<html>
<head><meta charset="utf-8"><title>radiostreamscan results</title>
<style>
body { font-family: Arial, sans-serif; }
.url-block { margin: 10px 0; padding: 10px; border: 1px solid #ccc; }
.error { color: #b00020; }
button { margin: 8px 0; }
</style>
</head>
<body>
<h1>radiostreamscan results</h1>
<button id="copy">Alle Streams kopieren</button>
<div id="output">Loading...</div>
<textarea id="clipboard" style="position:absolute; left:-9999px; top:-9999px;"></textarea>
<script>
const urls = %s.split("\n").filter(Boolean);
const interval = %s;
const probe = %s;

async function fetchData() {
const params = new URLSearchParams();
urls.forEach(u => params.append("url", u));
params.set("format", "json");
if (probe) params.set("probe", "1");

const res = await fetch("/scan?" + params.toString());
const data = await res.json();

const container = document.getElementById("output");
container.innerHTML = "";

const allStreams = [];

data.forEach(item => {
const block = document.createElement("div");
block.className = "url-block";
const title = document.createElement("h3");
title.textContent = item.url;
block.appendChild(title);

if (item.error) {
const err = document.createElement("div");
err.className = "error";
err.textContent = item.error;
block.appendChild(err);
container.appendChild(block);
return;
}

const list = document.createElement("ul");
(item.streams || []).forEach(s => {
const li = document.createElement("li");
li.textContent = s;
list.appendChild(li);
allStreams.push(s);
});
block.appendChild(list);
container.appendChild(block);
});

document.getElementById("clipboard").value = allStreams.join("\n");
}

document.getElementById("copy").addEventListener("click", () => {
const text = document.getElementById("clipboard").value;
if (navigator.clipboard && navigator.clipboard.writeText) {
navigator.clipboard.writeText(text);
} else {
const el = document.getElementById("clipboard");
el.select();
document.execCommand("copy");
}
});

fetchData();
if (interval && Number(interval) > 0) {
setInterval(fetchData, Number(interval) * 1000);
}
</script>
</body>
</html>`, urlsJS, intervalJS, probeJS)
}

// makeScanHandler returns the /scan handler. URLs arrive via GET query
// parameters or POST form fields; results are written in the requested
// format and appended to the history log.
//
// FIX: the probe/format overrides previously read only r.URL.Query(), so a
// POST request carrying probe/format in its body was silently ignored.
// r.FormValue consults both the query string and the parsed form body.
func makeScanHandler(client *http.Client, cfg *config, history *historyWriter) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var urls []string
		switch r.Method {
		case http.MethodGet:
			urls = r.URL.Query()["url"]
		case http.MethodPost:
			if err := r.ParseForm(); err != nil {
				http.Error(w, err.Error(), http.StatusBadRequest)
				return
			}
			urls = r.Form["url"]
		default:
			http.Error(w, "only GET and POST supported", http.StatusMethodNotAllowed)
			return
		}

		urls = normalizeURLInputs(urls)
		if len(urls) == 0 {
			http.Error(w, "provide at least one url parameter", http.StatusBadRequest)
			return
		}

		// Per-request overrides on a copy, so the shared config is untouched.
		localCfg := *cfg
		switch r.FormValue("probe") {
		case "1":
			localCfg.Probe = true
		case "0":
			localCfg.Probe = false
		}
		if f := r.FormValue("format"); f != "" {
			localCfg.Format = f
		}

		results := scanURLs(urls, client, &localCfg)
		history.Write(results)
		outputResults(results, localCfg.Format, w)
	}
}

// normalizeURLInputs splits every raw input on newlines (the web form
// submits many URLs in one textarea field), trims surrounding whitespace
// (which also drops textarea CR characters), and discards empty lines.
func normalizeURLInputs(inputs []string) []string {
	var out []string
	for _, raw := range inputs {
		for _, line := range strings.Split(raw, "\n") {
			if trimmed := strings.TrimSpace(line); trimmed != "" {
				out = append(out, trimmed)
			}
		}
	}
	return out
}

// scanURLs fans the URLs out to cfg.Concurrency worker goroutines and
// returns the results in input order. Each worker writes only its own
// result index, so no locking is needed on the results slice.
func scanURLs(urls []string, client *http.Client, cfg *config) []scanResult {
	type task struct {
		idx    int
		target string
	}

	results := make([]scanResult, len(urls))
	workers := cfg.Concurrency
	if workers < 1 {
		workers = 1
	}

	tasks := make(chan task)
	var wg sync.WaitGroup
	wg.Add(workers)
	for n := 0; n < workers; n++ {
		go func() {
			defer wg.Done()
			for tk := range tasks {
				results[tk.idx] = scanOneURL(client, cfg, tk.target)
			}
		}()
	}

	for i, u := range urls {
		tasks <- task{idx: i, target: u}
	}
	close(tasks)
	wg.Wait()
	return results
}

// scanOneURL fetches one page, extracts stream and playlist candidates,
// resolves each playlist into concrete stream URLs, and (when cfg.Probe is
// set) HEAD-probes the resulting streams. Fetch errors are reported in the
// result rather than aborting the batch.
func scanOneURL(client *http.Client, cfg *config, raw string) scanResult {
	res := scanResult{URL: raw, FetchedAt: time.Now()}

	// The page's own Content-Type is not needed here, so it is discarded at
	// the call site (the old code kept the variable and did `_ = contentType`).
	body, _, err := fetchContent(client, cfg, raw)
	if err != nil {
		res.Error = err.Error()
		return res
	}

	streams := extractor.ExtractStreams(body)
	res.Playlists = extractor.ExtractPlaylistLinks(body)

	// Best effort: a playlist that fails to download or parse is skipped so
	// one dead link does not fail the whole scan.
	for _, pl := range res.Playlists {
		plContent, plType, err := fetchContent(client, cfg, pl)
		if err != nil {
			continue
		}
		if parsed := extractor.ParsePlaylist(plContent, plType); len(parsed) > 0 {
			streams = append(streams, parsed...)
			res.FromPlaylist = true
		}
	}

	res.Streams = uniqueStrings(streams)

	if cfg.Probe {
		res.Probes = probeStreams(client, cfg, res.Streams)
	}
	return res
}

// fetchContent GETs raw with the configured extra headers and returns up to
// 2 MiB of the response body plus its Content-Type. Any non-200 status is
// reported as an error.
func fetchContent(client *http.Client, cfg *config, raw string) (string, string, error) {
	req, err := http.NewRequest(http.MethodGet, raw, nil)
	if err != nil {
		return "", "", err
	}
	req.Header.Set("User-Agent", "radiostreamscan/0.2")
	for _, hdr := range cfg.Headers {
		if name, value, ok := strings.Cut(hdr, ":"); ok {
			req.Header.Set(strings.TrimSpace(name), strings.TrimSpace(value))
		}
	}

	resp, err := client.Do(req)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", "", fmt.Errorf("unexpected status %s", resp.Status)
	}

	// Cap the read so an endless audio stream cannot pin memory.
	const maxBody = 2 << 20
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxBody))
	if err != nil {
		return "", "", err
	}

	return string(data), resp.Header.Get("Content-Type"), nil
}

// probeStreams issues one HEAD request per stream URL, recording the HTTP
// status and Content-Type. Transport failures are captured in the Status
// field; malformed URLs are skipped entirely.
func probeStreams(client *http.Client, cfg *config, streams []string) []probeResult {
	var probes []probeResult
	for _, target := range streams {
		req, err := http.NewRequest(http.MethodHead, target, nil)
		if err != nil {
			continue
		}
		for _, hdr := range cfg.Headers {
			if name, value, ok := strings.Cut(hdr, ":"); ok {
				req.Header.Set(strings.TrimSpace(name), strings.TrimSpace(value))
			}
		}
		resp, err := client.Do(req)
		if err != nil {
			probes = append(probes, probeResult{URL: target, Status: err.Error()})
			continue
		}
		resp.Body.Close()
		probes = append(probes, probeResult{
			URL:         target,
			Status:      resp.Status,
			ContentType: resp.Header.Get("Content-Type"),
		})
	}
	return probes
}

// outputResults writes results to w in the requested format; anything other
// than json/csv/pls falls back to the human-readable text form. When w is an
// http.ResponseWriter, a matching Content-Type header is set first.
//
// Improvements: the format is lower-cased once instead of four times, and
// pls responses now announce their MIME type like json/csv already did.
func outputResults(results []scanResult, format string, w io.Writer) {
	format = strings.ToLower(format)

	if rw, ok := w.(http.ResponseWriter); ok {
		switch format {
		case "json":
			rw.Header().Set("Content-Type", "application/json")
		case "csv":
			rw.Header().Set("Content-Type", "text/csv")
		case "pls":
			rw.Header().Set("Content-Type", "audio/x-scpls")
		}
	}

	switch format {
	case "json":
		// Write errors are deliberately ignored: w is stdout or an HTTP
		// response, and there is no recovery path mid-stream.
		_ = json.NewEncoder(w).Encode(results)
	case "csv":
		cw := csv.NewWriter(w)
		_ = cw.Write([]string{"input_url", "stream_url"})
		for _, res := range results {
			for _, s := range res.Streams {
				_ = cw.Write([]string{res.URL, s})
			}
		}
		cw.Flush()
	case "pls":
		fmt.Fprintln(w, "[playlist]")
		entry := 1
		for _, res := range results {
			for _, s := range res.Streams {
				fmt.Fprintf(w, "File%d=%s\n", entry, s)
				entry++
			}
		}
		fmt.Fprintf(w, "NumberOfEntries=%d\nVersion=2\n", entry-1)
	default:
		for _, res := range results {
			fmt.Fprintf(w, "URL: %s\n", res.URL)
			if res.Error != "" {
				fmt.Fprintf(w, " error: %s\n", res.Error)
				continue
			}
			if len(res.Streams) == 0 {
				fmt.Fprintln(w, " (no candidate streams found)")
				continue
			}
			for _, s := range res.Streams {
				fmt.Fprintf(w, " - %s\n", s)
			}
		}
	}
}

func newHTTPClient(proxyURL string) *http.Client {
transport := &http.Transport{}
if proxyURL != "" {
if parsed, err := url.Parse(proxyURL); err == nil {
transport.Proxy = http.ProxyURL(parsed)
}
}
return &http.Client{Timeout: 15 * time.Second, Transport: transport}
}

// uniqueStrings de-duplicates values while preserving first-seen order.
//
// FIX: the previous implementation collected values into a map and ranged
// over it, so the returned order was randomized on every call — making
// text/CSV/PLS output (and probe order) non-deterministic between runs.
func uniqueStrings(values []string) []string {
	seen := make(map[string]struct{}, len(values))
	out := make([]string, 0, len(values))
	for _, v := range values {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		out = append(out, v)
	}
	return out
}

// historyWriter appends scan results as JSON Lines to a file. The mutex
// serializes writes from concurrent HTTP requests; a nil receiver or an
// empty path disables logging entirely.
type historyWriter struct {
	path string     // target JSONL file; "" means disabled
	mu   sync.Mutex // guards file open/append across goroutines
}

// newHistoryWriter returns a writer targeting path; pass "" to disable.
func newHistoryWriter(path string) *historyWriter {
	return &historyWriter{path: path}
}

// Write appends each result as one JSON line to the history file.
// All failures (open, encode, flush) are swallowed deliberately: history is
// strictly best-effort and must never break a scan.
func (h *historyWriter) Write(results []scanResult) {
	if h == nil || h.path == "" {
		return
	}
	h.mu.Lock()
	defer h.mu.Unlock()

	f, err := os.OpenFile(h.path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return
	}
	defer f.Close()

	buf := bufio.NewWriter(f)
	enc := json.NewEncoder(buf)
	for _, res := range results {
		// Encoder.Encode appends the trailing newline itself; a result that
		// fails to encode writes nothing, matching the old skip-on-error.
		_ = enc.Encode(res)
	}
	buf.Flush()
}

+ 3
- 0
go.mod View File

@@ -0,0 +1,3 @@
module radio-stream-extractor

go 1.21

+ 162
- 0
internal/extractor/extractor.go View File

@@ -0,0 +1,162 @@
package extractor

import (
"regexp"
"sort"
"strings"
)

// urlPattern matches absolute or protocol-relative URLs ending in a known
// audio/playlist extension; group 1 is the full URL.
var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`)

// attrPattern matches player-config style assignments such as
// streamsrc: '...' or audioUrl="..."; group 2 is the quoted value.
var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)

// srcPattern matches HTML/JS src="..." attributes; group 1 is the value.
var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`)

// xspfPattern matches <location>...</location> entries in XSPF playlists.
var xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)

// ExtractStreams returns the unique streaming URLs found in the provided
// HTML/text, sorted lexicographically.
func ExtractStreams(data string) []string {
	found := make(map[string]struct{})
	consider := func(raw string) {
		raw = strings.TrimSpace(raw)
		if raw == "" {
			return
		}
		// Only absolute or protocol-relative references qualify.
		if !strings.Contains(raw, "http") && !strings.HasPrefix(raw, "//") {
			return
		}
		if strings.HasPrefix(raw, "//") {
			raw = "https:" + raw
		}
		// Drop trailing '+' and literal `\\` sequences — presumably leftovers
		// from matches inside inline-script string expressions.
		cleaned := strings.ReplaceAll(strings.TrimRight(raw, "+"), `\\`, "")
		found[cleaned] = struct{}{}
	}

	for _, m := range urlPattern.FindAllStringSubmatch(data, -1) {
		consider(m[1])
	}
	for _, m := range attrPattern.FindAllStringSubmatch(data, -1) {
		consider(m[2])
	}
	for _, m := range srcPattern.FindAllStringSubmatch(data, -1) {
		consider(m[1])
	}

	streams := make([]string, 0, len(found))
	for u := range found {
		if isStreamURL(u) {
			streams = append(streams, u)
		}
	}
	sort.Strings(streams)
	return streams
}

// ExtractPlaylistLinks returns URLs likely pointing to playlists
// (m3u/pls/xspf/json), unique and sorted.
func ExtractPlaylistLinks(data string) []string {
	found := make(map[string]struct{})
	consider := func(raw string) {
		raw = strings.TrimSpace(raw)
		if raw == "" {
			return
		}
		// Only absolute or protocol-relative references qualify.
		if !strings.Contains(raw, "http") && !strings.HasPrefix(raw, "//") {
			return
		}
		if strings.HasPrefix(raw, "//") {
			raw = "https:" + raw
		}
		cleaned := strings.ReplaceAll(strings.TrimRight(raw, "+"), `\\`, "")
		if isPlaylistURL(cleaned) {
			found[cleaned] = struct{}{}
		}
	}

	for _, m := range urlPattern.FindAllStringSubmatch(data, -1) {
		consider(m[1])
	}
	for _, m := range attrPattern.FindAllStringSubmatch(data, -1) {
		consider(m[2])
	}
	for _, m := range srcPattern.FindAllStringSubmatch(data, -1) {
		consider(m[1])
	}

	links := make([]string, 0, len(found))
	for u := range found {
		links = append(links, u)
	}
	sort.Strings(links)
	return links
}

// ParsePlaylist extracts stream URLs from playlist content (M3U, PLS, XSPF,
// or any URL-bearing text such as JSON). contentType is the playlist
// response's Content-Type and serves only as an XSPF hint. The result is
// unique and sorted.
//
// FIX: the former json-specific branch re-ran the exact urlPattern scan that
// is already performed unconditionally over the whole body — a redundant
// duplicate loop. It has been removed; behavior is unchanged because map
// insertion is idempotent.
func ParsePlaylist(content string, contentType string) []string {
	candidates := make(map[string]struct{})
	add := func(raw string) {
		raw = strings.TrimSpace(raw)
		if raw == "" {
			return
		}
		if strings.HasPrefix(raw, "//") {
			raw = "https:" + raw
		}
		if isStreamURL(raw) {
			candidates[raw] = struct{}{}
		}
	}

	// XSPF: pull URLs out of <location> tags when either the Content-Type
	// or the body itself suggests XSPF.
	if strings.Contains(strings.ToLower(contentType), "xspf") || strings.Contains(strings.ToLower(content), "<location>") {
		for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
			add(match[1])
		}
	}

	// Generic pass over the whole body; this also covers JSON payloads.
	for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
		add(match[1])
	}

	// Line-oriented pass: bare M3U entries and PLS "FileN=" keys.
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
			if _, value, ok := strings.Cut(line, "="); ok {
				add(value)
			}
			continue
		}
		if strings.Contains(line, "http") {
			for _, match := range urlPattern.FindAllStringSubmatch(line, -1) {
				add(match[1])
			}
		}
	}

	streams := make([]string, 0, len(candidates))
	for u := range candidates {
		streams = append(streams, u)
	}
	sort.Strings(streams)
	return streams
}

// isStreamURL reports whether u contains a direct-audio extension
// (mp3/aac/m3u8/ogg/opus) anywhere in the URL, case-insensitively.
func isStreamURL(u string) bool {
	lower := strings.ToLower(u)
	for _, ext := range []string{".mp3", ".aac", ".m3u8", ".ogg", ".opus"} {
		if strings.Contains(lower, ext) {
			return true
		}
	}
	return false
}

// isPlaylistURL reports whether u contains a playlist extension
// (m3u/pls/xspf/json) anywhere in the URL, case-insensitively.
// Note: ".m3u" also matches ".m3u8", mirroring the stream check.
func isPlaylistURL(u string) bool {
	lower := strings.ToLower(u)
	for _, ext := range []string{".m3u", ".pls", ".xspf", ".json"} {
		if strings.Contains(lower, ext) {
			return true
		}
	}
	return false
}

+ 51
- 0
internal/extractor/extractor_test.go View File

@@ -0,0 +1,51 @@
package extractor

import "testing"

// TestExtractStreams feeds a page mixing script variables, anchors, and
// media tags; every audio URL must come back, while the .pls playlist link
// must be filtered out (6 streams total).
func TestExtractStreams(t *testing.T) {
	html := `
<script>
var streamsrc = 'https://example.com/live/stream.mp3';
var streamhash="https://cdn.example.net/relay.m3u8";
var audioUrl="https://example.com/audio.ogg";
</script>
<a href="https://streams.example.org/radio.aac?user=test">listen</a>
<source src="//players.example.eu/ambient.ogg" type="audio/ogg" />
<audio data-src="https://pod.example.com/episode.opus"></audio>
<div data-value="https://example.com/secret.pls"></div>
`

	streams := ExtractStreams(html)
	if got := len(streams); got != 6 {
		t.Fatalf("wanted 6 streams, got %d: %v", got, streams)
	}
}

// TestExtractPlaylistLinks checks that each supported playlist extension
// (m3u, pls, xspf, json) is recognized as a playlist link.
func TestExtractPlaylistLinks(t *testing.T) {
	html := `
<a href="https://example.com/stream.m3u">m3u</a>
<a href="https://example.com/playlist.pls">pls</a>
<a href="https://example.com/radio.xspf">xspf</a>
<a href="https://example.com/data.json">json</a>
`

	found := ExtractPlaylistLinks(html)
	if got := len(found); got != 4 {
		t.Fatalf("wanted 4 playlist links, got %d: %v", got, found)
	}
}

// TestParsePlaylist verifies that one stream URL is recovered from each
// playlist dialect: M3U line entries, PLS FileN= keys, and XSPF <location>.
func TestParsePlaylist(t *testing.T) {
	cases := []struct {
		content     string
		contentType string
		msg         string
	}{
		{"#EXTM3U\nhttps://example.com/live.mp3\n", "audio/x-mpegurl", "expected m3u playlist to yield 1 stream"},
		{"[playlist]\nFile1=https://example.com/stream.aac\n", "audio/x-scpls", "expected pls playlist to yield 1 stream"},
		{"<playlist><location>https://example.com/hls.m3u8</location></playlist>", "application/xspf+xml", "expected xspf playlist to yield 1 stream"},
	}

	for _, tc := range cases {
		if len(ParsePlaylist(tc.content, tc.contentType)) != 1 {
			t.Fatal(tc.msg)
		}
	}
}

Loading…
Cancel
Save