实战攻略:如何爬取电影天堂的子页面内容?(附13-re示例)
最编程
2024-02-04 16:04:32
...
# 先进入到电影天堂首页,可以看到2021必看热片模块
# 随便点击一个链接,会打开一个子页面,子页面下方有下载地址,我们要爬取这个下载地址
import requests
import re
# Scrape the home page's "2021必看热片" section: collect the detail-page link of
# every entry in that list, then print each detail page's title and the first
# download link found on it.
url = "https://dytt89.com/"
headers = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Mobile Safari/537.36 Edg/96.0.1054.43"
}

# obj1 captures the <ul> that follows the section heading.
obj1 = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
# obj2 captures each href inside that list; the leading "/" is matched outside
# the group so the captured path can be appended directly to `url`.
obj2 = re.compile(r"<a href='/(?P<href>.*?)'", re.S)
# obj3 captures a detail page's <title> and the href inside the download cell.
obj3 = re.compile(r'<title>(?P<movie>.*?)</title>.*?<td '
                  r'style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<aaa>.*?)">', re.S)

# The timeout keeps an unresponsive server from blocking the script forever;
# the context manager releases the connection as soon as the body is read.
with requests.get(url, headers=headers, timeout=10) as resp:
    # Override the encoding used by `.text` instead of relying on the one
    # requests guesses from the response.
    resp.encoding = "gb2312"
    html = resp.text

# Collect the absolute URL of every detail page linked from the section.
child_href = []
for item in obj1.finditer(html):
    ul = item.group("ul")
    for item1 in obj2.finditer(ul):
        href = item1.group("href")
        # The download links live on this detail page.
        website = url + href
        child_href.append(website)

for website in child_href:
    # NOTE(review): verify=False disables TLS certificate verification for the
    # detail-page requests (the home-page request above keeps it enabled).
    # Kept as in the original; confirm it is still required for this site.
    with requests.get(
        website, headers=headers, verify=False, timeout=10
    ) as child_resp:
        child_resp.encoding = "gb2312"
        result3 = obj3.search(child_resp.text)
    if result3 is None:
        # The page layout did not match obj3; report it and keep going rather
        # than failing on `None.group(...)`.
        print(f"no download link found: {website}")
        continue
    print(result3.group("movie"))
    print(result3.group("aaa"))