選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

293 行
7.5KB

  1. package extractor
  2. import (
  3. "io"
  4. "net/url"
  5. "regexp"
  6. "sort"
  7. "strings"
  8. "golang.org/x/net/html"
  9. )
// urlPattern matches absolute or protocol-relative URLs that end in a known
// audio or playlist extension, optionally followed by a query string and a
// fragment. Group 1 is the full URL, group 2 the extension.
var urlPattern = regexp.MustCompile(`(?i)((?:https?:)?\/\/[^\s"'<>]+\.(mp3|aac|m3u8|ogg|opus|pls|m3u|xspf|json)(?:\?[^\s"'<>]*)?(?:#[^\s"'<>]*)?)`)

// attrPattern matches quoted values assigned to stream-related keys in inline
// JavaScript/JSON or HTML attributes (e.g. streamsrc: '...', mp3Url="...").
// Group 2 is the quoted value.
var attrPattern = regexp.MustCompile(`(?i)(streamsrc|streamhash|stream|audioUrl|mp3Url|hls|playlist|source)\s*[:=]\s*['"]([^'"]+)['"]`)

// xspfPattern extracts the text content of <location> elements in XSPF playlists.
var xspfPattern = regexp.MustCompile(`(?i)<location>([^<]+)</location>`)
  13. // ExtractStreams returns the unique streaming URLs found in the provided HTML/text.
  14. func ExtractStreams(data string) []string {
  15. candidates := make(map[string]struct{})
  16. special := make(map[string]struct{})
  17. add := func(raw string) {
  18. if normalized, ok := normalizeCandidate(raw); ok {
  19. candidates[normalized] = struct{}{}
  20. }
  21. }
  22. addSpecial := func(raw string) {
  23. if normalized, ok := normalizeCandidate(raw); ok {
  24. candidates[normalized] = struct{}{}
  25. special[normalized] = struct{}{}
  26. }
  27. }
  28. for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
  29. add(match[1])
  30. }
  31. for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
  32. add(match[2])
  33. }
  34. for _, u := range extractTagAttrs(data, "audio", "src", "data-src") {
  35. addSpecial(u)
  36. }
  37. for _, u := range extractTagAttrs(data, "source", "src", "data-src") {
  38. addSpecial(u)
  39. }
  40. for _, u := range extractTagAttrs(data, "a", "href") {
  41. add(u)
  42. }
  43. streams := make([]string, 0, len(candidates))
  44. for u := range candidates {
  45. if isStreamURL(u) {
  46. streams = append(streams, u)
  47. continue
  48. }
  49. if _, ok := special[u]; ok {
  50. streams = append(streams, u)
  51. }
  52. }
  53. sort.Strings(streams)
  54. return streams
  55. }
  56. // ExtractPlaylistLinks returns URLs likely pointing to playlists (m3u/pls/xspf/json).
  57. func ExtractPlaylistLinks(data string) []string {
  58. candidates := make(map[string]struct{})
  59. add := func(raw string) {
  60. if normalized, ok := normalizeCandidate(raw); ok && isPlaylistURL(normalized) {
  61. candidates[normalized] = struct{}{}
  62. }
  63. }
  64. for _, match := range urlPattern.FindAllStringSubmatch(data, -1) {
  65. add(match[1])
  66. }
  67. for _, match := range attrPattern.FindAllStringSubmatch(data, -1) {
  68. add(match[2])
  69. }
  70. for _, u := range extractTagAttrs(data, "a", "href") {
  71. add(u)
  72. }
  73. for _, u := range extractTagAttrs(data, "source", "src", "data-src") {
  74. add(u)
  75. }
  76. links := make([]string, 0, len(candidates))
  77. for u := range candidates {
  78. links = append(links, u)
  79. }
  80. sort.Strings(links)
  81. return links
  82. }
// ExtractEmbedURLs returns URLs found in iframe embeds.
// It scans data for <iframe> tags and returns their normalized src attribute
// values, deduplicated and sorted.
func ExtractEmbedURLs(data string) []string {
	return extractTagAttrs(data, "iframe", "src")
}
// ExtractScriptURLs returns URLs referenced by script tags.
// It scans data for <script> tags and returns their normalized src attribute
// values, deduplicated and sorted.
func ExtractScriptURLs(data string) []string {
	return extractTagAttrs(data, "script", "src")
}
  91. // ParsePlaylist extracts stream URLs from playlist content.
  92. func ParsePlaylist(content string, contentType string, baseURL string) []string {
  93. candidates := make(map[string]struct{})
  94. add := func(raw string) {
  95. raw = strings.TrimSpace(raw)
  96. if raw == "" {
  97. return
  98. }
  99. if strings.HasPrefix(raw, "//") {
  100. raw = "https:" + raw
  101. }
  102. if isStreamURL(raw) {
  103. if resolved := resolveRelative(raw, baseURL); resolved != "" {
  104. candidates[resolved] = struct{}{}
  105. }
  106. }
  107. }
  108. addForce := func(raw string) {
  109. raw = strings.TrimSpace(raw)
  110. if raw == "" {
  111. return
  112. }
  113. if strings.HasPrefix(raw, "//") {
  114. raw = "https:" + raw
  115. }
  116. if resolved := resolveRelative(raw, baseURL); resolved != "" {
  117. candidates[resolved] = struct{}{}
  118. }
  119. }
  120. lowerType := strings.ToLower(contentType)
  121. lines := strings.Split(content, "\n")
  122. if strings.Contains(lowerType, "xspf") || strings.Contains(strings.ToLower(content), "<location>") {
  123. for _, match := range xspfPattern.FindAllStringSubmatch(content, -1) {
  124. add(match[1])
  125. }
  126. }
  127. for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
  128. add(match[1])
  129. }
  130. for _, line := range lines {
  131. line = strings.TrimSpace(line)
  132. if line == "" || strings.HasPrefix(line, "#") {
  133. continue
  134. }
  135. if strings.HasPrefix(strings.ToLower(line), "file") && strings.Contains(line, "=") {
  136. parts := strings.SplitN(line, "=", 2)
  137. add(parts[1])
  138. continue
  139. }
  140. if strings.Contains(line, "http") {
  141. matched := false
  142. for _, match := range urlPattern.FindAllStringSubmatch(line, -1) {
  143. add(match[1])
  144. matched = true
  145. }
  146. if !matched {
  147. addForce(line)
  148. }
  149. continue
  150. }
  151. if baseURL != "" && (strings.Contains(strings.ToLower(line), ".mp3") || strings.Contains(strings.ToLower(line), ".aac") || strings.Contains(strings.ToLower(line), ".m3u8") || strings.Contains(strings.ToLower(line), ".ogg") || strings.Contains(strings.ToLower(line), ".opus")) {
  152. addForce(line)
  153. }
  154. }
  155. if strings.Contains(lowerType, "json") {
  156. for _, match := range urlPattern.FindAllStringSubmatch(content, -1) {
  157. add(match[1])
  158. }
  159. }
  160. streams := make([]string, 0, len(candidates))
  161. for u := range candidates {
  162. streams = append(streams, u)
  163. }
  164. sort.Strings(streams)
  165. return streams
  166. }
  167. func extractTagAttrs(data string, tag string, attrs ...string) []string {
  168. attrSet := make(map[string]struct{}, len(attrs))
  169. for _, a := range attrs {
  170. attrSet[strings.ToLower(a)] = struct{}{}
  171. }
  172. candidates := make(map[string]struct{})
  173. z := html.NewTokenizer(strings.NewReader(data))
  174. for {
  175. tt := z.Next()
  176. switch tt {
  177. case html.ErrorToken:
  178. if z.Err() == io.EOF {
  179. urls := make([]string, 0, len(candidates))
  180. for u := range candidates {
  181. urls = append(urls, u)
  182. }
  183. sort.Strings(urls)
  184. return urls
  185. }
  186. return nil
  187. case html.StartTagToken, html.SelfClosingTagToken:
  188. name, hasAttr := z.TagName()
  189. if !strings.EqualFold(string(name), tag) || !hasAttr {
  190. continue
  191. }
  192. for {
  193. key, val, more := z.TagAttr()
  194. if _, ok := attrSet[strings.ToLower(string(key))]; ok {
  195. if normalized, ok := normalizeCandidate(string(val)); ok {
  196. candidates[normalized] = struct{}{}
  197. }
  198. }
  199. if !more {
  200. break
  201. }
  202. }
  203. }
  204. }
  205. }
  206. func normalizeCandidate(raw string) (string, bool) {
  207. raw = strings.TrimSpace(raw)
  208. if raw == "" {
  209. return "", false
  210. }
  211. if !(strings.Contains(raw, "http") || strings.HasPrefix(raw, "//") || strings.HasPrefix(raw, "/")) {
  212. return "", false
  213. }
  214. if strings.HasPrefix(raw, "//") {
  215. raw = "https:" + raw
  216. }
  217. normalized := strings.TrimRight(raw, "+")
  218. normalized = strings.ReplaceAll(normalized, `\\`, "")
  219. if normalized == "" {
  220. return "", false
  221. }
  222. return normalized, true
  223. }
  224. func resolveRelative(raw string, base string) string {
  225. raw = strings.TrimSpace(raw)
  226. if raw == "" {
  227. return ""
  228. }
  229. if base == "" {
  230. return raw
  231. }
  232. if strings.HasPrefix(raw, "http://") || strings.HasPrefix(raw, "https://") {
  233. return raw
  234. }
  235. if strings.HasPrefix(raw, "//") {
  236. return "https:" + raw
  237. }
  238. return ResolveURL(base, raw)
  239. }
  240. // ResolveURL resolves a possibly relative URL against a base.
  241. func ResolveURL(base string, href string) string {
  242. href = strings.TrimSpace(href)
  243. if href == "" {
  244. return ""
  245. }
  246. parsed, err := url.Parse(href)
  247. if err != nil {
  248. return ""
  249. }
  250. if parsed.IsAbs() {
  251. return parsed.String()
  252. }
  253. baseURL, err := url.Parse(base)
  254. if err != nil {
  255. return parsed.String()
  256. }
  257. return baseURL.ResolveReference(parsed).String()
  258. }
  259. func isStreamURL(u string) bool {
  260. lower := strings.ToLower(u)
  261. return strings.Contains(lower, ".mp3") || strings.Contains(lower, ".aac") || strings.Contains(lower, ".m3u8") ||
  262. strings.Contains(lower, ".ogg") || strings.Contains(lower, ".opus")
  263. }
  264. func isPlaylistURL(u string) bool {
  265. lower := strings.ToLower(u)
  266. return strings.Contains(lower, ".m3u") || strings.Contains(lower, ".pls") ||
  267. strings.Contains(lower, ".xspf") || strings.Contains(lower, ".json")
  268. }