欢迎您访问 最编程 本站为您分享编程语言代码,编程技术文章!
您现在的位置是: 首页

实战攻略:如何爬取电影天堂的子页面内容?(附13-re示例)

最编程 2024-02-04 16:04:32
...

# 先进入到电影天堂首页,可以看到2021必看热片模块
# 随便点击一个链接,会再打开一个网站,网站下面有下载地址,我们要爬取这个下载地址

import requests
import re

# Home page of the site. Detail-page paths are appended to this string later
# in the script, so the trailing slash must stay.
url = "https://dytt89.com/"

# Send a browser-like User-Agent with every request.
headers = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Mobile Safari/537.36 Edg/96.0.1054.43"
}

# Fetch the home page. requests waits indefinitely by default, so an explicit
# timeout keeps an unresponsive server from hanging the script forever.
# Certificate verification stays enabled here (verify=False is NOT passed).
resp = requests.get(url, headers=headers, timeout=10)
# Decode the body as gb2312 rather than relying on requests' own guess.
resp.encoding = "gb2312"

html = resp.text

# Patterns used to walk from the home page down to a download link.
# DOTALL lets "." cross line breaks, because the matched HTML spans several lines.

# Contents of the first <ul> that follows the section heading text in the pattern.
obj1 = re.compile(
    r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>",
    flags=re.DOTALL,
)
# Site-relative path (leading "/" dropped) of each single-quoted <a href='...'>.
obj2 = re.compile(r"<a href='/(?P<href>.*?)'", flags=re.DOTALL)
# Page <title> plus the href of the link that directly follows the <td> with
# the break-word style and #fdfddf background.
obj3 = re.compile(
    r'<title>(?P<movie>.*?)</title>.*?'
    r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<aaa>.*?)">',
    flags=re.DOTALL,
)


# Build the absolute URL of every detail page linked from the matched <ul>
# section(s) of the home page; the download links live on those pages.
child_href = []
for ul_html in obj1.findall(html):
    # Each pattern has exactly one group, so findall yields the captured
    # strings directly (the <ul> body here, the relative paths below).
    child_href.extend(url + path for path in obj2.findall(ul_html))


# Visit every detail page and print its title followed by its download link.
for website in child_href:
    # NOTE: verify=False turns off TLS certificate verification for these
    # requests (kept from the original script); drop it if the certificate
    # validates. The timeout stops one stalled page from hanging the run, and
    # the `with` block releases each connection once the body has been read.
    with requests.get(website, headers=headers, verify=False, timeout=10) as child_resp:
        child_resp.encoding = "gb2312"
        result3 = obj3.search(child_resp.text)

    if result3 is None:
        # The page did not match the expected layout; report it and move on
        # instead of crashing on None.group(...).
        print(f"no download link found: {website}")
        continue

    print(result3.group("movie"))
    print(result3.group("aaa"))

# Release the home-page response fetched at the top of the script.
resp.close()