package extractor import ( "io" "net/url" "regexp" "sort" "strings" "golang.org/x/net/html" ) var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json)(?:\?[^\s"'<>]*)?(?:#[^\s"'<>]*)?)`) var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`) var xspfPattern = regexp.MustCompile(`(?i)([^<]+)`) // ExtractStreams returns the unique streaming URLs found in the provided HTML/text. func ExtractStreams(data string) []string { candidates := make(map[string]struct{}) special := make(map[string]struct{}) add := func(raw string) { if normalized, ok := normalizeCandidate(raw); ok { candidates[normalized] = struct{}{} } } addSpecial := func(raw string) { if normalized, ok := normalizeCandidate(raw); ok { candidates[normalized] = struct{}{} special[normalized] = struct{}{} } } for _, match := range urlPattern.FindAllStringSubmatch(data, -1) { add(match[1]) } for _, match := range attrPattern.FindAllStringSubmatch(data, -1) { add(match[2]) } for _, u := range extractTagAttrs(data, "audio", "src", "data-src") { addSpecial(u) } for _, u := range extractTagAttrs(data, "source", "src", "data-src") { addSpecial(u) } for _, u := range extractTagAttrs(data, "a", "href") { add(u) } streams := make([]string, 0, len(candidates)) for u := range candidates { if isStreamURL(u) { streams = append(streams, u) continue } if _, ok := special[u]; ok { streams = append(streams, u) } } sort.Strings(streams) return streams } // ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json). 
func ExtractPlaylistLinks(data string) []string { candidates := make(map[string]struct{}) add := func(raw string) { if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) { candidates[normalized] = struct{}{} } } for _, match := range urlPattern.FindAllStringSubmatch(data, -1) { add(match[1]) } for _, match := range attrPattern.FindAllStringSubmatch(data, -1) { add(match[2]) } for _, u := range extractTagAttrs(data, "a", "href") { add(u) } for _, u := range extractTagAttrs(data, "source", "src", "data-src") { add(u) } links := make([]string, 0, len(candidates)) for u := range candidates { links = append(links, u) } sort.Strings(links) return links } // ExtractEmbedURLs returns URLs found in iframe embeds. func ExtractEmbedURLs(data string) []string { return extractTagAttrs(data, "iframe", "src") } // ExtractScriptURLs returns URLs referenced by script tags. func ExtractScriptURLs(data string) []string { return extractTagAttrs(data, "script", "src") } // ParsePlaylist extracts stream URLs from playlist content. 
func ParsePlaylist(content string, contentType string, baseURL string) []string { candidates := make(map[string]struct{}) add := func(raw string) { raw = strings.TrimSpace(raw) if raw == "" { return } if strings.HasPrefix(raw, "//") { raw = "https:" + raw } if isStreamURL(raw) { if resolved := resolveRelative(raw, baseURL); resolved != "" { candidates[resolved] = struct{}{} } } } addForce := func(raw string) { raw = strings.TrimSpace(raw) if raw == "" { return } if strings.HasPrefix(raw, "//") { raw = "https:" + raw } if resolved := resolveRelative(raw, baseURL); resolved != "" { candidates[resolved] = struct{}{} } } lowerType := strings.ToLower(contentType) lines := strings.Split(content, "\n") if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "") { for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) { add(match[1]) } } for _, match := range urlPattern.FindAllStringSubmatch(content, -1) { add(match[1]) } for _, line := range lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "#") { continue } if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") { parts := strings.SplitN(line, "=", 2) add(parts[1]) continue } if strings.Contains(line, "http") { matched := false for _, match := range urlPattern.FindAllStringSubmatch(line, -1) { add(match[1]) matched = true } if !matched { addForce(line) } continue } if baseURL != "" && (strings.Contains(strings.ToLower(line), ".mp3") || strings.Contains(strings.ToLower(line), ".aac") || strings.Contains(strings.ToLower(line), ".m3u8") || strings.Contains(strings.ToLower(line), ".ogg") || strings.Contains(strings.ToLower(line), ".opus")) { addForce(line) } } if strings.Contains(lowerType, "json") { for _, match := range urlPattern.FindAllStringSubmatch(content, -1) { add(match[1]) } } streams := make([]string, 0, len(candidates)) for u := range candidates { streams = append(streams, u) } sort.Strings(streams) return streams 
} func extractTagAttrs(data string, tag string, attrs ...string) []string { attrSet := make(map[string]struct{}, len(attrs)) for _, a := range attrs { attrSet[strings.ToLower(a)] = struct{}{} } candidates := make(map[string]struct{}) z := html.NewTokenizer(strings.NewReader(data)) for { tt := z.Next() switch tt { case html.ErrorToken: if z.Err() == io.EOF { urls := make([]string, 0, len(candidates)) for u := range candidates { urls = append(urls, u) } sort.Strings(urls) return urls } return nil case html.StartTagToken, html.SelfClosingTagToken: name, hasAttr := z.TagName() if !strings.EqualFold(string(name), tag) || !hasAttr { continue } for { key, val, more := z.TagAttr() if _, ok := attrSet[strings.ToLower(string(key))]; ok { if normalized, ok := normalizeCandidate(string(val)); ok { candidates[normalized] = struct{}{} } } if !more { break } } } } } func normalizeCandidate(raw string) (string, bool) { raw = strings.TrimSpace(raw) if raw == "" { return "", false } if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//") || strings.HasPrefix(raw, "/")) { return "", false } if strings.HasPrefix(raw, "//") { raw = "https:" + raw } normalized := strings.TrimRight(raw, "+") normalized = strings.ReplaceAll(normalized, `\\`, "") if normalized == "" { return "", false } return normalized, true } func resolveRelative(raw string, base string) string { raw = strings.TrimSpace(raw) if raw == "" { return "" } if base == "" { return raw } if strings.HasPrefix(raw, "http://") || strings.HasPrefix(raw, "https://") { return raw } if strings.HasPrefix(raw, "//") { return "https:" + raw } return ResolveURL(base, raw) } // ResolveURL resolves a possibly relative URL against a base. 
func ResolveURL(base string, href string) string {
	trimmed := strings.TrimSpace(href)
	if trimmed == "" {
		return ""
	}

	ref, refErr := url.Parse(trimmed)
	if refErr != nil {
		return ""
	}

	// Only relative references need the base; if the base itself cannot be
	// parsed, fall through and return the reference unchanged.
	if !ref.IsAbs() {
		if root, baseErr := url.Parse(base); baseErr == nil {
			return root.ResolveReference(ref).String()
		}
	}
	return ref.String()
}

// isStreamURL reports whether u contains, case-insensitively, one of the
// audio stream extensions .mp3, .aac, .m3u8, .ogg or .opus.
func isStreamURL(u string) bool {
	folded := strings.ToLower(u)
	for _, ext := range []string{".mp3", ".aac", ".m3u8", ".ogg", ".opus"} {
		if strings.Contains(folded, ext) {
			return true
		}
	}
	return false
}

// isPlaylistURL reports whether u contains, case-insensitively, one of the
// playlist extensions .m3u, .pls, .xspf or .json. The check is a substring
// match, so ".m3u8" URLs are accepted as well.
func isPlaylistURL(u string) bool {
	folded := strings.ToLower(u)
	for _, ext := range []string{".m3u", ".pls", ".xspf", ".json"} {
		if strings.Contains(folded, ext) {
			return true
		}
	}
	return false
}