package extractor
import (
"io"
"net/url"
"regexp"
"sort"
"strings"
"golang.org/x/net/html"
)
// Patterns shared by the extractors. They are compiled once at package
// initialization.
var (
	// urlPattern matches absolute (http/https) or protocol-relative URLs whose
	// path ends in a known audio or playlist extension, optionally followed by a
	// query string and/or fragment. Group 1 is the whole URL, group 2 the
	// extension.
	urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json)(?:\?[^\s"'<>]*)?(?:#[^\s"'<>]*)?)`)

	// attrPattern matches quoted key/value assignments such as
	// `stream: "..."` or `audioUrl = '...'`. Group 1 is the key, group 2 the
	// quoted value.
	attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)

	// xspfPattern matches the text of an XSPF <location> element. Group 1 is the
	// element text (surrounding whitespace is not trimmed here).
	//
	// The pattern must be anchored on the element tags: a bare `([^<]+)` matches
	// every run of text without '<', which turns a whole plain-text playlist
	// into a single bogus "URL".
	xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)
)
// ExtractStreams returns the unique streaming URLs found in the provided
// HTML/text, sorted lexicographically.
//
// Candidates are gathered from urlPattern matches, attrPattern values, the
// src/data-src attributes of <audio> and <source> tags, and the href attribute
// of <a> tags. A candidate is kept when isStreamURL accepts it, or when it came
// from an <audio>/<source> tag (those are kept regardless of extension).
func ExtractStreams(data string) []string {
	// seen maps every normalized candidate to whether a media tag supplied it.
	seen := make(map[string]bool)
	record := func(raw string, fromMediaTag bool) {
		normalized, ok := normalizeCandidate(raw)
		if !ok {
			return
		}
		// Once flagged as coming from a media tag, a candidate stays flagged.
		seen[normalized] = seen[normalized] || fromMediaTag
	}

	for _, groups := range urlPattern.FindAllStringSubmatch(data, -1) {
		record(groups[1], false)
	}
	for _, groups := range attrPattern.FindAllStringSubmatch(data, -1) {
		record(groups[2], false)
	}
	for _, mediaTag := range []string{"audio", "source"} {
		for _, link := range extractTagAttrs(data, mediaTag, "src", "data-src") {
			record(link, true)
		}
	}
	for _, link := range extractTagAttrs(data, "a", "href") {
		record(link, false)
	}

	result := make([]string, 0, len(seen))
	for link, fromMediaTag := range seen {
		if fromMediaTag || isStreamURL(link) {
			result = append(result, link)
		}
	}
	sort.Strings(result)
	return result
}
// ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json).
func ExtractPlaylistLinks(data string) []string {
candidates := make(map[string]struct{})
add := func(raw string) {
if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
candidates[normalized] = struct{}{}
}
}
for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
add(match[1])
}
for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
add(match[2])
}
for _, u := range extractTagAttrs(data, "a", "href") {
add(u)
}
for _, u := range extractTagAttrs(data, "source", "src", "data-src") {
add(u)
}
links := make([]string, 0, len(candidates))
for u := range candidates {
links = append(links, u)
}
sort.Strings(links)
return links
}
// ExtractEmbedURLs returns the normalized src attribute values of the <iframe>
// tags found in data, as produced by extractTagAttrs.
func ExtractEmbedURLs(data string) []string {
	const (
		tag  = "iframe"
		attr = "src"
	)
	return extractTagAttrs(data, tag, attr)
}
// ExtractScriptURLs returns the normalized src attribute values of the <script>
// tags found in data, as produced by extractTagAttrs.
func ExtractScriptURLs(data string) []string {
	const (
		tag  = "script"
		attr = "src"
	)
	return extractTagAttrs(data, tag, attr)
}
// ParsePlaylist extracts stream URLs from playlist content (M3U-style line
// lists, PLS "FileN=" entries, XSPF <location> elements, or any text/JSON that
// embeds URLs matched by urlPattern).
//
// content is the raw playlist body. contentType is the media type reported for
// it; it is only consulted to detect XSPF. baseURL, when non-empty, is used to
// resolve relative entries.
//
// The result is de-duplicated, sorted, and never nil.
func ParsePlaylist(content string, contentType string, baseURL string) []string {
	candidates := make(map[string]struct{})

	// collect cleans up raw and records its resolved form. Unless force is set,
	// values that isStreamURL rejects are dropped.
	collect := func(raw string, force bool) {
		raw = strings.TrimSpace(raw)
		if raw == "" {
			return
		}
		if strings.HasPrefix(raw, "//") {
			raw = "https:" + raw
		}
		if !force && !isStreamURL(raw) {
			return
		}
		if resolved := resolveRelative(raw, baseURL); resolved != "" {
			candidates[resolved] = struct{}{}
		}
	}

	// Only run the XSPF scan on XSPF payloads. The previous check tested for an
	// empty substring, which is always true, so the scan ran on every input.
	isXSPF := strings.Contains(strings.ToLower(contentType), "xspf") ||
		strings.Contains(strings.ToLower(content), "<location>")
	if isXSPF {
		for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
			collect(match[1], false)
		}
	}

	// Whole-content scan. This also covers JSON payloads, so no separate
	// content-type specific pass is needed.
	for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
		collect(match[1], false)
	}

	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			// Blank lines and '#'-prefixed lines carry no entry of their own.
			continue
		}

		// PLS-style "File1=<url>" entry: take everything after the first '='.
		if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
			parts := strings.SplitN(line, "=", 2)
			collect(parts[1], false)
			continue
		}

		if strings.Contains(line, "http") {
			matches := urlPattern.FindAllStringSubmatch(line, -1)
			for _, match := range matches {
				collect(match[1], false)
			}
			if len(matches) == 0 {
				// No recognizable extension: accept the whole line as the
				// entry.
				collect(line, true)
			}
			continue
		}

		// Relative entry: only usable when there is a base to resolve against.
		if baseURL != "" && isStreamURL(line) {
			collect(line, true)
		}
	}

	streams := make([]string, 0, len(candidates))
	for u := range candidates {
		streams = append(streams, u)
	}
	sort.Strings(streams)
	return streams
}
// extractTagAttrs tokenizes data as HTML and returns the normalized values of
// the listed attributes found on start or self-closing tags named tag. Tag and
// attribute names are compared case-insensitively. The result is de-duplicated
// and sorted. If the tokenizer stops with an error other than io.EOF, nil is
// returned.
func extractTagAttrs(data string, tag string, attrs ...string) []string {
	wanted := make(map[string]struct{}, len(attrs))
	for _, name := range attrs {
		wanted[strings.ToLower(name)] = struct{}{}
	}

	found := make(map[string]struct{})
	tokenizer := html.NewTokenizer(strings.NewReader(data))
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			if tokenizer.Err() != io.EOF {
				return nil
			}
			// End of input: fall through to build the result.
			break
		}
		if tokenType != html.StartTagToken && tokenType != html.SelfClosingTagToken {
			continue
		}

		tagName, hasAttr := tokenizer.TagName()
		if !hasAttr || !strings.EqualFold(string(tagName), tag) {
			continue
		}

		// Walk every attribute of the tag; TagAttr reports whether more follow.
		for more := true; more; {
			var key, val []byte
			key, val, more = tokenizer.TagAttr()
			if _, ok := wanted[strings.ToLower(string(key))]; !ok {
				continue
			}
			if normalized, ok := normalizeCandidate(string(val)); ok {
				found[normalized] = struct{}{}
			}
		}
	}

	urls := make([]string, 0, len(found))
	for u := range found {
		urls = append(urls, u)
	}
	sort.Strings(urls)
	return urls
}
// normalizeCandidate cleans up a raw URL candidate scraped from markup or
// script text. It returns the cleaned value and true, or "" and false when raw
// does not look like a URL.
//
// A candidate is accepted when, after trimming whitespace, it contains "http"
// or starts with "/" (protocol-relative "//" values get an "https:" scheme).
// Trailing '+' characters are removed, and every backslash is stripped so that
// JSON-escaped URLs such as `https:\/\/host\/a.mp3` become usable.
func normalizeCandidate(raw string) (string, bool) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return "", false
	}
	if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//") || strings.HasPrefix(raw, "/")) {
		return "", false
	}
	if strings.HasPrefix(raw, "//") {
		raw = "https:" + raw
	}
	normalized := strings.TrimRight(raw, "+")
	// Inside a raw string literal a backslash is literal, so `\` is exactly one
	// backslash. The previous `\\` literal only removed doubled backslashes and
	// left the common single-escaped form (`\/`) untouched.
	normalized = strings.ReplaceAll(normalized, `\`, "")
	if normalized == "" {
		return "", false
	}
	return normalized, true
}
// resolveRelative turns raw into a usable URL, resolving it against base when
// needed. Cases are checked in order:
//   - blank input yields "";
//   - with an empty base the trimmed input is returned unchanged;
//   - http:// and https:// URLs are returned unchanged;
//   - protocol-relative ("//host/...") values get an "https:" scheme;
//   - anything else is resolved against base via ResolveURL.
func resolveRelative(raw string, base string) string {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "":
		return ""
	case base == "":
		return trimmed
	case strings.HasPrefix(trimmed, "http://"), strings.HasPrefix(trimmed, "https://"):
		return trimmed
	case strings.HasPrefix(trimmed, "//"):
		return "https:" + trimmed
	default:
		return ResolveURL(base, trimmed)
	}
}
// ResolveURL resolves a possibly relative href against base and returns the
// result as a string.
//
// A blank or unparsable href yields "". An href that already has a scheme is
// returned in its parsed-and-reserialized form. If base cannot be parsed, the
// href is returned unresolved.
func ResolveURL(base string, href string) string {
	target := strings.TrimSpace(href)
	if target == "" {
		return ""
	}

	ref, err := url.Parse(target)
	if err != nil {
		return ""
	}

	// Only relative references need the base; an unusable base leaves the
	// reference as it is.
	if !ref.IsAbs() {
		if root, baseErr := url.Parse(base); baseErr == nil {
			ref = root.ResolveReference(ref)
		}
	}
	return ref.String()
}
// isStreamURL reports whether u contains, case-insensitively, one of the
// stream extensions .mp3, .aac, .m3u8, .ogg or .opus. The check is a plain
// substring match anywhere in u, not a test of the path suffix.
func isStreamURL(u string) bool {
	lowered := strings.ToLower(u)
	for _, ext := range []string{".mp3", ".aac", ".m3u8", ".ogg", ".opus"} {
		if strings.Contains(lowered, ext) {
			return true
		}
	}
	return false
}
// isPlaylistURL reports whether u contains, case-insensitively, one of the
// playlist extensions .m3u, .pls, .xspf or .json. The check is a plain
// substring match, so ".m3u" also accepts ".m3u8" URLs.
func isPlaylistURL(u string) bool {
	lowered := strings.ToLower(u)
	for _, ext := range []string{".m3u", ".pls", ".xspf", ".json"} {
		if strings.Contains(lowered, ext) {
			return true
		}
	}
	return false
}