|
- package extractor
-
- import (
- "regexp"
- "sort"
- "strings"
- )
-
// Patterns used by the extractors. All of them are case-insensitive and are
// compiled once when the package is initialised.
var (
	// urlPattern matches an absolute or protocol-relative URL that ends in a
	// known audio or playlist extension. Group 1 is the URL, group 2 the
	// extension.
	urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`)

	// attrPattern matches a quoted value assigned (with ":" or "=") to one of
	// a fixed set of stream-related keys. Group 1 is the key, group 2 the
	// value.
	attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)

	// srcPattern matches any quoted src attribute. Group 1 is the value.
	srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`)

	// iframePattern captures the src value of an <iframe> tag in group 1.
	iframePattern = regexp.MustCompile(`(?i)<iframe[^>]+src\s*=\s*['"]([^'"]+)['"]`)

	// scriptPattern captures the src value of a <script> tag in group 1.
	scriptPattern = regexp.MustCompile(`(?i)<script[^>]+src\s*=\s*['"]([^'"]+)['"]`)

	// audioPattern captures the src value of an <audio> tag in group 1.
	audioPattern = regexp.MustCompile(`(?i)<audio[^>]+src\s*=\s*['"]([^'"]+)['"]`)

	// sourcePattern captures the src value of a <source> tag in group 1.
	sourcePattern = regexp.MustCompile(`(?i)<source[^>]+src\s*=\s*['"]([^'"]+)['"]`)

	// xspfPattern captures the text of a <location> element in group 1.
	xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)
)
-
- // ExtractStreams returns the unique streaming URLs found in the provided HTML/text.
- func ExtractStreams(data string) []string {
- candidates := make(map[string]struct{})
- special := make(map[string]struct{})
- add := func(raw string) {
- if normalized, ok := normalizeCandidate(raw); ok {
- candidates[normalized] = struct{}{}
- }
- }
- addSpecial := func(raw string) {
- if normalized, ok := normalizeCandidate(raw); ok {
- candidates[normalized] = struct{}{}
- special[normalized] = struct{}{}
- }
- }
-
- for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
- add(match[1])
- }
- for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
- add(match[2])
- }
- for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
- add(match[1])
- }
- for _, match := range audioPattern.FindAllStringSubmatch(data, -1) {
- addSpecial(match[1])
- }
- for _, match := range sourcePattern.FindAllStringSubmatch(data, -1) {
- addSpecial(match[1])
- }
-
- streams := make([]string, 0, len(candidates))
- for u := range candidates {
- if isStreamURL(u) {
- streams = append(streams, u)
- continue
- }
- if _, ok := special[u]; ok {
- streams = append(streams, u)
- }
- }
- sort.Strings(streams)
- return streams
- }
-
- // ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json).
- func ExtractPlaylistLinks(data string) []string {
- candidates := make(map[string]struct{})
- add := func(raw string) {
- if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
- candidates[normalized] = struct{}{}
- }
- }
-
- for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
- add(match[1])
- }
- for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
- add(match[2])
- }
- for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
- add(match[1])
- }
-
- links := make([]string, 0, len(candidates))
- for u := range candidates {
- links = append(links, u)
- }
- sort.Strings(links)
- return links
- }
-
- // ExtractEmbedURLs returns URLs found in iframe embeds.
- func ExtractEmbedURLs(data string) []string {
- return extractURLs(iframePattern, data)
- }
-
- // ExtractScriptURLs returns URLs referenced by script tags.
- func ExtractScriptURLs(data string) []string {
- return extractURLs(scriptPattern, data)
- }
-
- // ParsePlaylist extracts stream URLs from playlist content.
- func ParsePlaylist(content string, contentType string) []string {
- candidates := make(map[string]struct{})
- add := func(raw string) {
- raw = strings.TrimSpace(raw)
- if raw == "" {
- return
- }
- if strings.HasPrefix(raw, "//") {
- raw = "https:" + raw
- }
- if isStreamURL(raw) {
- candidates[raw] = struct{}{}
- }
- }
- addForce := func(raw string) {
- raw = strings.TrimSpace(raw)
- if raw == "" {
- return
- }
- if strings.HasPrefix(raw, "//") {
- raw = "https:" + raw
- }
- candidates[raw] = struct{}{}
- }
-
- lowerType := strings.ToLower(contentType)
- lines := strings.Split(content, "\n")
-
- if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "<location>") {
- for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
- add(match[1])
- }
- }
-
- for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
- add(match[1])
- }
-
- for _, line := range lines {
- line = strings.TrimSpace(line)
- if line == "" || strings.HasPrefix(line, "#") {
- continue
- }
- if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
- parts := strings.SplitN(line, "=", 2)
- add(parts[1])
- continue
- }
- if strings.Contains(line, "http") {
- matched := false
- for _, match := range urlPattern.FindAllStringSubmatch(line, -1) {
- add(match[1])
- matched = true
- }
- if !matched {
- addForce(line)
- }
- }
- }
-
- if strings.Contains(lowerType, "json") {
- for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
- add(match[1])
- }
- }
-
- streams := make([]string, 0, len(candidates))
- for u := range candidates {
- streams = append(streams, u)
- }
- sort.Strings(streams)
- return streams
- }
- func extractURLs(pattern *regexp.Regexp, data string) []string {
- candidates := make(map[string]struct{})
- for _, match := range pattern.FindAllStringSubmatch(data, -1) {
- if normalized, ok := normalizeCandidate(match[1]); ok {
- candidates[normalized] = struct{}{}
- }
- }
-
- urls := make([]string, 0, len(candidates))
- for u := range candidates {
- urls = append(urls, u)
- }
- sort.Strings(urls)
- return urls
- }
-
// normalizeCandidate cleans up a raw URL captured from markup or script text.
//
// It trims surrounding whitespace, removes every backslash (so escaped URLs
// such as `https:\/\/host\/a.mp3` become usable), gives protocol-relative URLs
// ("//host/...") an https scheme and strips trailing "+" characters.
//
// The boolean result is false, and the string empty, when nothing is left
// after cleaning or when the value does not look like a URL: it must contain
// "http" or start with "/".
func normalizeCandidate(raw string) (string, bool) {
	// Unescape before the shape checks so that an escaped protocol-relative
	// or root-relative URL (`\/\/host\/...`, `\/path`) is recognised too.
	normalized := strings.ReplaceAll(strings.TrimSpace(raw), `\`, "")
	if normalized == "" {
		return "", false
	}

	switch {
	case strings.HasPrefix(normalized, "//"):
		normalized = "https:" + normalized
	case strings.HasPrefix(normalized, "/"), strings.Contains(normalized, "http"):
		// Root-relative path or something carrying an http(s) URL: keep as is.
	default:
		return "", false
	}

	// The value still starts with "/" or contains "http" here, so trimming
	// trailing "+" characters can never leave it empty.
	return strings.TrimRight(normalized, "+"), true
}
-
// isStreamURL reports whether u contains, anywhere and in any letter case, one
// of the audio stream extensions .mp3, .aac, .m3u8, .ogg or .opus.
func isStreamURL(u string) bool {
	lowered := strings.ToLower(u)
	for _, ext := range [...]string{".mp3", ".aac", ".m3u8", ".ogg", ".opus"} {
		if strings.Contains(lowered, ext) {
			return true
		}
	}
	return false
}
-
// isPlaylistURL reports whether u contains, anywhere and in any letter case,
// one of the playlist extensions .m3u, .pls, .xspf or .json. Because the test
// is a substring match, .m3u8 URLs are reported as playlists as well.
func isPlaylistURL(u string) bool {
	lowered := strings.ToLower(u)
	for _, ext := range [...]string{".m3u", ".pls", ".xspf", ".json"} {
		if strings.Contains(lowered, ext) {
			return true
		}
	}
	return false
}
|