|
- package extractor
-
- import (
- "io"
- "net/url"
- "regexp"
- "sort"
- "strings"
-
- "golang.org/x/net/html"
- )
-
// Regular expressions used to pull candidate URLs out of raw markup or text.
// All three are case-insensitive.
var (
	// urlPattern matches absolute (http/https) or protocol-relative URLs whose
	// path ends in a known audio or playlist extension, optionally followed by
	// a query string and a fragment. Group 1 is the whole URL, group 2 the
	// extension.
	urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json)(?:\?[^\s"'<>]*)?(?:#[^\s"'<>]*)?)`)

	// attrPattern matches quoted key/value assignments such as
	// stream: "..." or audioUrl='...'. Group 1 is the key, group 2 the value
	// between the quotes.
	attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)

	// xspfPattern captures the text of a <location> element in group 1.
	xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)
)
-
- // ExtractStreams returns the unique streaming URLs found in the provided HTML/text.
- func ExtractStreams(data string) []string {
- candidates := make(map[string]struct{})
- special := make(map[string]struct{})
- add := func(raw string) {
- if normalized, ok := normalizeCandidate(raw); ok {
- candidates[normalized] = struct{}{}
- }
- }
- addSpecial := func(raw string) {
- if normalized, ok := normalizeCandidate(raw); ok {
- candidates[normalized] = struct{}{}
- special[normalized] = struct{}{}
- }
- }
-
- for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
- add(match[1])
- }
- for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
- add(match[2])
- }
- for _, u := range extractTagAttrs(data, "audio", "src", "data-src") {
- addSpecial(u)
- }
- for _, u := range extractTagAttrs(data, "source", "src", "data-src") {
- addSpecial(u)
- }
- for _, u := range extractTagAttrs(data, "a", "href") {
- add(u)
- }
-
- streams := make([]string, 0, len(candidates))
- for u := range candidates {
- if isStreamURL(u) {
- streams = append(streams, u)
- continue
- }
- if _, ok := special[u]; ok {
- streams = append(streams, u)
- }
- }
- sort.Strings(streams)
- return streams
- }
-
- // ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json).
- func ExtractPlaylistLinks(data string) []string {
- candidates := make(map[string]struct{})
- add := func(raw string) {
- if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
- candidates[normalized] = struct{}{}
- }
- }
-
- for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
- add(match[1])
- }
- for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
- add(match[2])
- }
- for _, u := range extractTagAttrs(data, "a", "href") {
- add(u)
- }
- for _, u := range extractTagAttrs(data, "source", "src", "data-src") {
- add(u)
- }
-
- links := make([]string, 0, len(candidates))
- for u := range candidates {
- links = append(links, u)
- }
- sort.Strings(links)
- return links
- }
-
- // ExtractEmbedURLs returns URLs found in iframe embeds.
- func ExtractEmbedURLs(data string) []string {
- return extractTagAttrs(data, "iframe", "src")
- }
-
- // ExtractScriptURLs returns URLs referenced by script tags.
- func ExtractScriptURLs(data string) []string {
- return extractTagAttrs(data, "script", "src")
- }
-
- // ParsePlaylist extracts stream URLs from playlist content.
- func ParsePlaylist(content string, contentType string, baseURL string) []string {
- candidates := make(map[string]struct{})
- add := func(raw string) {
- raw = strings.TrimSpace(raw)
- if raw == "" {
- return
- }
- if strings.HasPrefix(raw, "//") {
- raw = "https:" + raw
- }
- if isStreamURL(raw) {
- if resolved := resolveRelative(raw, baseURL); resolved != "" {
- candidates[resolved] = struct{}{}
- }
- }
- }
- addForce := func(raw string) {
- raw = strings.TrimSpace(raw)
- if raw == "" {
- return
- }
- if strings.HasPrefix(raw, "//") {
- raw = "https:" + raw
- }
- if resolved := resolveRelative(raw, baseURL); resolved != "" {
- candidates[resolved] = struct{}{}
- }
- }
-
- lowerType := strings.ToLower(contentType)
- lines := strings.Split(content, "\n")
-
- if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "<location>") {
- for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
- add(match[1])
- }
- }
-
- for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
- add(match[1])
- }
-
- for _, line := range lines {
- line = strings.TrimSpace(line)
- if line == "" || strings.HasPrefix(line, "#") {
- continue
- }
- if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
- parts := strings.SplitN(line, "=", 2)
- add(parts[1])
- continue
- }
- if strings.Contains(line, "http") {
- matched := false
- for _, match := range urlPattern.FindAllStringSubmatch(line, -1) {
- add(match[1])
- matched = true
- }
- if !matched {
- addForce(line)
- }
- continue
- }
- if baseURL != "" && (strings.Contains(strings.ToLower(line), ".mp3") || strings.Contains(strings.ToLower(line), ".aac") || strings.Contains(strings.ToLower(line), ".m3u8") || strings.Contains(strings.ToLower(line), ".ogg") || strings.Contains(strings.ToLower(line), ".opus")) {
- addForce(line)
- }
- }
-
- if strings.Contains(lowerType, "json") {
- for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
- add(match[1])
- }
- }
-
- streams := make([]string, 0, len(candidates))
- for u := range candidates {
- streams = append(streams, u)
- }
- sort.Strings(streams)
- return streams
- }
- func extractTagAttrs(data string, tag string, attrs ...string) []string {
- attrSet := make(map[string]struct{}, len(attrs))
- for _, a := range attrs {
- attrSet[strings.ToLower(a)] = struct{}{}
- }
-
- candidates := make(map[string]struct{})
- z := html.NewTokenizer(strings.NewReader(data))
- for {
- tt := z.Next()
- switch tt {
- case html.ErrorToken:
- if z.Err() == io.EOF {
- urls := make([]string, 0, len(candidates))
- for u := range candidates {
- urls = append(urls, u)
- }
- sort.Strings(urls)
- return urls
- }
- return nil
- case html.StartTagToken, html.SelfClosingTagToken:
- name, hasAttr := z.TagName()
- if !strings.EqualFold(string(name), tag) || !hasAttr {
- continue
- }
- for {
- key, val, more := z.TagAttr()
- if _, ok := attrSet[strings.ToLower(string(key))]; ok {
- if normalized, ok := normalizeCandidate(string(val)); ok {
- candidates[normalized] = struct{}{}
- }
- }
- if !more {
- break
- }
- }
- }
- }
- }
-
// normalizeCandidate cleans up a raw URL candidate taken from markup or script
// text. It returns the cleaned URL and true, or "" and false when raw cannot
// be a URL of interest.
//
// Steps:
//   - every backslash is removed, so JSON/JS-escaped values such as
//     https:\/\/host\/a.mp3 become plain URLs (a backslash is never valid in
//     a URL),
//   - surrounding whitespace is trimmed,
//   - the value must contain "http" or start with "/" (which includes the
//     protocol-relative "//" form),
//   - a protocol-relative value gets an "https:" scheme,
//   - trailing '+' characters are dropped.
func normalizeCandidate(raw string) (string, bool) {
	// Unescape before the prefix checks so that an escaped "\/\/host/..." is
	// recognized as protocol-relative.
	cleaned := strings.TrimSpace(strings.ReplaceAll(raw, `\`, ""))
	if cleaned == "" {
		return "", false
	}
	if !strings.Contains(cleaned, "http") && !strings.HasPrefix(cleaned, "/") {
		return "", false
	}
	if strings.HasPrefix(cleaned, "//") {
		cleaned = "https:" + cleaned
	}
	cleaned = strings.TrimRight(cleaned, "+")
	if cleaned == "" {
		return "", false
	}
	return cleaned, true
}
-
- func resolveRelative(raw string, base string) string {
- raw = strings.TrimSpace(raw)
- if raw == "" {
- return ""
- }
- if base == "" {
- return raw
- }
- if strings.HasPrefix(raw, "http://") || strings.HasPrefix(raw, "https://") {
- return raw
- }
- if strings.HasPrefix(raw, "//") {
- return "https:" + raw
- }
- return ResolveURL(base, raw)
- }
-
// ResolveURL resolves href, which may be relative, against base.
//
// Whitespace around href is ignored. It returns "" when href is empty or
// cannot be parsed. An href that already has a scheme is returned in its
// parsed, re-serialized form. When base cannot be parsed, the re-serialized
// href is returned unresolved.
func ResolveURL(base string, href string) string {
	trimmed := strings.TrimSpace(href)
	if trimmed == "" {
		return ""
	}

	ref, refErr := url.Parse(trimmed)
	switch {
	case refErr != nil:
		return ""
	case ref.IsAbs():
		return ref.String()
	}

	if root, baseErr := url.Parse(base); baseErr == nil {
		return root.ResolveReference(ref).String()
	}
	return ref.String()
}
-
// isStreamURL reports whether u contains, anywhere and in any letter case,
// one of the audio stream extensions .mp3, .aac, .m3u8, .ogg or .opus.
func isStreamURL(u string) bool {
	lowered := strings.ToLower(u)
	for _, ext := range [...]string{".mp3", ".aac", ".m3u8", ".ogg", ".opus"} {
		if strings.Contains(lowered, ext) {
			return true
		}
	}
	return false
}
-
// isPlaylistURL reports whether u contains, anywhere and in any letter case,
// one of the playlist extensions .m3u, .pls, .xspf or .json. Because the
// check is a substring match, .m3u8 URLs are accepted as well.
func isPlaylistURL(u string) bool {
	lowered := strings.ToLower(u)
	for _, ext := range [...]string{".m3u", ".pls", ".xspf", ".json"} {
		if strings.Contains(lowered, ext) {
			return true
		}
	}
	return false
}
|