選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

221 行
6.0KB

  1. package extractor
  2. import (
  3. "regexp"
  4. "sort"
  5. "strings"
  6. )
  7. var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json))`)
  8. var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)
  9. var srcPattern = regexp.MustCompile(`(?i)src\s*=\s*['"]([^'"]+)['"]`)
  10. var iframePattern = regexp.MustCompile(`(?i)<iframe[^>]+src\s*=\s*['"]([^'"]+)['"]`)
  11. var scriptPattern = regexp.MustCompile(`(?i)<script[^>]+src\s*=\s*['"]([^'"]+)['"]`)
  12. var audioPattern = regexp.MustCompile(`(?i)<audio[^>]+src\s*=\s*['"]([^'"]+)['"]`)
  13. var sourcePattern = regexp.MustCompile(`(?i)<source[^>]+src\s*=\s*['"]([^'"]+)['"]`)
  14. var xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)
  15. // ExtractStreams returns the unique streaming URLs found in the provided HTML/text.
  16. func ExtractStreams(data string) []string {
  17. candidates := make(map[string]struct{})
  18. special := make(map[string]struct{})
  19. add := func(raw string) {
  20. if normalized, ok := normalizeCandidate(raw); ok {
  21. candidates[normalized] = struct{}{}
  22. }
  23. }
  24. addSpecial := func(raw string) {
  25. if normalized, ok := normalizeCandidate(raw); ok {
  26. candidates[normalized] = struct{}{}
  27. special[normalized] = struct{}{}
  28. }
  29. }
  30. for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
  31. add(match[1])
  32. }
  33. for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
  34. add(match[2])
  35. }
  36. for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
  37. add(match[1])
  38. }
  39. for _, match := range audioPattern.FindAllStringSubmatch(data, -1) {
  40. addSpecial(match[1])
  41. }
  42. for _, match := range sourcePattern.FindAllStringSubmatch(data, -1) {
  43. addSpecial(match[1])
  44. }
  45. streams := make([]string, 0, len(candidates))
  46. for u := range candidates {
  47. if isStreamURL(u) {
  48. streams = append(streams, u)
  49. continue
  50. }
  51. if _, ok := special[u]; ok {
  52. streams = append(streams, u)
  53. }
  54. }
  55. sort.Strings(streams)
  56. return streams
  57. }
  58. // ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json).
  59. func ExtractPlaylistLinks(data string) []string {
  60. candidates := make(map[string]struct{})
  61. add := func(raw string) {
  62. if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
  63. candidates[normalized] = struct{}{}
  64. }
  65. }
  66. for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
  67. add(match[1])
  68. }
  69. for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
  70. add(match[2])
  71. }
  72. for _, match := range srcPattern.FindAllStringSubmatch(data, -1) {
  73. add(match[1])
  74. }
  75. links := make([]string, 0, len(candidates))
  76. for u := range candidates {
  77. links = append(links, u)
  78. }
  79. sort.Strings(links)
  80. return links
  81. }
  82. // ExtractEmbedURLs returns URLs found in iframe embeds.
  83. func ExtractEmbedURLs(data string) []string {
  84. return extractURLs(iframePattern, data)
  85. }
  86. // ExtractScriptURLs returns URLs referenced by script tags.
  87. func ExtractScriptURLs(data string) []string {
  88. return extractURLs(scriptPattern, data)
  89. }
  90. // ParsePlaylist extracts stream URLs from playlist content.
  91. func ParsePlaylist(content string, contentType string) []string {
  92. candidates := make(map[string]struct{})
  93. add := func(raw string) {
  94. raw = strings.TrimSpace(raw)
  95. if raw == "" {
  96. return
  97. }
  98. if strings.HasPrefix(raw, "//") {
  99. raw = "https:" + raw
  100. }
  101. if isStreamURL(raw) {
  102. candidates[raw] = struct{}{}
  103. }
  104. }
  105. addForce := func(raw string) {
  106. raw = strings.TrimSpace(raw)
  107. if raw == "" {
  108. return
  109. }
  110. if strings.HasPrefix(raw, "//") {
  111. raw = "https:" + raw
  112. }
  113. candidates[raw] = struct{}{}
  114. }
  115. lowerType := strings.ToLower(contentType)
  116. lines := strings.Split(content, "\n")
  117. if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "<location>") {
  118. for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
  119. add(match[1])
  120. }
  121. }
  122. for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
  123. add(match[1])
  124. }
  125. for _, line := range lines {
  126. line = strings.TrimSpace(line)
  127. if line == "" || strings.HasPrefix(line, "#") {
  128. continue
  129. }
  130. if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
  131. parts := strings.SplitN(line, "=", 2)
  132. add(parts[1])
  133. continue
  134. }
  135. if strings.Contains(line, "http") {
  136. matched := false
  137. for _, match := range urlPattern.FindAllStringSubmatch(line, -1) {
  138. add(match[1])
  139. matched = true
  140. }
  141. if !matched {
  142. addForce(line)
  143. }
  144. }
  145. }
  146. if strings.Contains(lowerType, "json") {
  147. for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
  148. add(match[1])
  149. }
  150. }
  151. streams := make([]string, 0, len(candidates))
  152. for u := range candidates {
  153. streams = append(streams, u)
  154. }
  155. sort.Strings(streams)
  156. return streams
  157. }
  158. func extractURLs(pattern *regexp.Regexp, data string) []string {
  159. candidates := make(map[string]struct{})
  160. for _, match := range pattern.FindAllStringSubmatch(data, -1) {
  161. if normalized, ok := normalizeCandidate(match[1]); ok {
  162. candidates[normalized] = struct{}{}
  163. }
  164. }
  165. urls := make([]string, 0, len(candidates))
  166. for u := range candidates {
  167. urls = append(urls, u)
  168. }
  169. sort.Strings(urls)
  170. return urls
  171. }
  172. func normalizeCandidate(raw string) (string, bool) {
  173. raw = strings.TrimSpace(raw)
  174. if raw == "" {
  175. return "", false
  176. }
  177. if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//") || strings.HasPrefix(raw, "/")) {
  178. return "", false
  179. }
  180. if strings.HasPrefix(raw, "//") {
  181. raw = "https:" + raw
  182. }
  183. normalized := strings.TrimRight(raw, "+")
  184. normalized = strings.ReplaceAll(normalized, `\\`, "")
  185. if normalized == "" {
  186. return "", false
  187. }
  188. return normalized, true
  189. }
  190. func isStreamURL(u string) bool {
  191. lower := strings.ToLower(u)
  192. return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") ||
  193. strings.Contains(lower, ".ogg") || strings.Contains(lower, ".opus")
  194. }
  195. func isPlaylistURL(u string) bool {
  196. lower := strings.ToLower(u)
  197. return strings.Contains(lower, ".m3u") || strings.Contains(lower, ".pls") ||
  198. strings.Contains(lower, ".xspf") || strings.Contains(lower, ".json")
  199. }