
Scraping Dangerous Mushroom Images from the Tiantang Image Site (ivsky.com) with Python

最编程 2024-02-04 16:01:42
import os
import urllib.request

import requests
from bs4 import BeautifulSoup


def cbk(a, b, c):
    """Progress callback for urlretrieve.

    a: number of data blocks downloaded so far
    b: size of each data block
    c: total size of the remote file
    """
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)
    print(" ")


url = 'https://www.ivsky.com/tupian/dumogu_t28446/'  # one image gallery: poisonous mushrooms
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400',
    'Referer': 'http://www.ivsky.com/tupian/qita/index_11.html'
}


def spidertupian():
    save_dir = "E:/the_data/mogo/"
    os.makedirs(save_dir, exist_ok=True)  # make sure the output directory exists
    for i in range(1, 12):
        link = url + 'index_' + str(i) + '.html'  # url already ends with '/', so no extra slash
        html = requests.get(link, headers=headers, timeout=800)  # fetch the listing page
        mess = BeautifulSoup(html.text, 'html.parser')
        # Find <ul> elements whose class is 'pli'. Because 'class' is a Python
        # keyword, BeautifulSoup takes this attribute as 'class_' instead.
        for page in mess.find_all('ul', class_='pli'):
            x = 0
            for img in page.find_all('img'):
                imgurl = img.get('src')  # src attribute of the thumbnail
                save_path = save_dir + str(i) + "_" + str(x) + ".jpg"  # local save path
                imghttp = 'https:' + imgurl  # the src is protocol-relative, so prepend the scheme
                urllib.request.urlretrieve(imghttp, save_path, cbk)  # download with progress callback
                x += 1


if __name__ == '__main__':
    spidertupian()
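The third argument to urllib.request.urlretrieve is a reporthook: the standard library calls it after each chunk is written, passing the number of blocks transferred so far, the block size, and the total file size, which is exactly the contract cbk implements. Below is a minimal, standalone sketch of that mechanism; the URL and output filename are placeholders for illustration, not taken from the gallery above.

import urllib.request

def progress(block_count, block_size, total_size):
    """Print percentage progress; same idea as the cbk callback above."""
    if total_size > 0:  # total_size is -1 when the server sends no Content-Length
        percent = min(100.0, 100.0 * block_count * block_size / total_size)
        print('%.2f%%' % percent)

# Placeholder URL and output name for illustration only.
urllib.request.urlretrieve('https://example.com/sample.jpg', 'sample.jpg', progress)

Guarding on total_size matters because urlretrieve passes -1 as the total when the response has no Content-Length header, which would otherwise yield a negative percentage; the cbk callback in the scraper assumes the size is always known.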