pufei漫画网爬虫:完整代码分享
最编程
2024-08-11 11:36:28
...
import os
import re
import threading
import time
from urllib.parse import quote
import execjs
import requests
from lxml import etree
import logging
# Configure the root logger: INFO level with a timestamped, file/line-tagged format.
# NOTE: `filemode` is honored only together with `filename`; the original code
# passed filemode='r', which is an invalid (read-only) mode for a log file and
# would make basicConfig fail if the file handler below were enabled. Use 'a'
# (append) when logging to a file.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
# To log to a file instead of stderr, add:
#   filename=os.path.join(os.getcwd(), 'log', 'mh.log'), filemode='a'
class PuFei:
    """Crawler for m.pufei.net.

    Searches the site for a comic by name, takes the first search hit,
    and downloads every page image of every chapter into
    ``./<keyboard>/<index>_<chapter title>/<page>.jpg``.
    """

    def __init__(self, keyboard):
        """
        :param keyboard: comic name to search for; also used as the name of
            the top-level download directory.
        """
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                                      " (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36"}
        self.search_url = 'http://m.pufei.net/e/search/?'
        self.keyboard = keyboard

    def search(self):
        """Search for ``self.keyboard`` and download all chapters of the
        first result, oldest chapter first.

        Logs an error and returns early if the search yields no result or
        the page layout is not as expected.
        """
        # The site expects the keyword gb2312-encoded in the query string.
        url = self.search_url + 'searchget=1&tbname=mh&show=title,player,playadmin,bieming,pinyin,playadmin&' \
                                'tempid=4&keyboard=' + quote(self.keyboard, encoding='gb2312')
        result_search = requests.get(url=url, headers=self.headers)
        # Fix: the site serves gb2312; without this the logged text (and any
        # text-derived data) is mojibake.
        result_search.encoding = 'gb2312'
        try:
            tree_search = etree.HTML(result_search.text)
            mh_url = ''.join(('http://m.pufei.net', tree_search.xpath('//*[@id="detail"]/li[1]/a/@href')[0]))
        except IndexError as e:
            # No search hit (or the page layout changed) — nothing to download.
            logging.error(result_search.text)
            logging.error(e)
            return
        result_chapter = requests.get(url=mh_url, headers=self.headers)
        result_chapter.encoding = 'gb2312'
        try:
            tree_chapter = etree.HTML(result_chapter.text)
            # The chapter list is newest-first on the page; reverse both
            # parallel lists so chapters are numbered in reading order.
            for index, (href, title) in enumerate(zip(tree_chapter.xpath('//*[@id="chapterList2"]/ul/li/a/@href')[::-1],
                                                      tree_chapter.xpath('//*[@id="chapterList2"]/ul/li/a/@title')[
                                                      ::-1])):
                chapter_url = ''.join(('http://m.pufei.net', href))
                # Strip characters that are illegal in Windows file names.
                r_str = r'[\/:*?"<>|]'
                title = re.sub(r_str, "", title)
                dir_name = f"{index}_{title}"
                dir_path = os.path.join(os.getcwd(), self.keyboard, dir_name)
                if not os.path.exists(dir_path):
                    os.makedirs(dir_path)
                self.chapter(chapter_url, dir_path)
                logging.info(f"{dir_path}, {chapter_url}")
        except IndexError as e:
            logging.error(result_chapter.text)
            logging.error(e)
            return

    def chapter(self, chapter_url, dir_path):
        """Download all page images of one chapter into ``dir_path``.

        Each image is fetched on its own daemonless thread; pages that
        already exist on disk are skipped.
        """
        result_chapter = requests.get(url=chapter_url, headers=self.headers)
        result_chapter.encoding = 'gb2312'
        # `cp` is a base64-obfuscated JS payload embedded in the page that
        # decodes to the list of image paths.
        cp_matches = re.findall(r'cp="\w+.*"', result_chapter.text)
        if not cp_matches:
            # Fix: the original indexed [0] unconditionally and crashed with
            # IndexError when a chapter page carried no cp= blob.
            logging.error(f"no image data found at {chapter_url}")
            return
        cp = cp_matches[0][4:-1]
        page_list = self.page_url_list(cp)
        for index, value in enumerate(page_list):
            page_url = "http://res.img.pufei.net/" + value
            file_path = os.path.join(dir_path, f"{index}.jpg")
            if not os.path.exists(file_path):
                # Mild rate limit so the spawned threads don't hammer the host.
                time.sleep(0.1)
                threading.Thread(target=self.page, args=(page_url, file_path)).start()

    def page(self, page_url, file_path):
        """Fetch one image and write it to ``file_path``.

        Runs on a worker thread; any failure is logged, never raised.
        """
        try:
            result_page = requests.get(url=page_url, headers=self.headers)
            with open(file_path, 'wb') as fp:
                fp.write(result_page.content)
            logging.info(file_path)
        except Exception as e:
            logging.error(f"{page_url} - {e}")

    @staticmethod
    def page_url_list(cp):
        """Decode the ``cp`` payload into a list of image URL paths.

        Runs the site's own base64 decoder in a JS engine via execjs.
        SECURITY NOTE: ``geturl`` ultimately calls JS ``eval`` on
        server-supplied data — acceptable only because this is a
        special-purpose scraper for this one site; never reuse on
        untrusted input.
        """
        js_code = '''
        function base64decode(str) {
            var base64EncodeChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
            var base64DecodeChars = new Array(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);
            var c1, c2, c3, c4;
            var i, len, out;
            len = str.length;
            i = 0;
            out = "";
            while (i < len) {
                do {
                    c1 = base64DecodeChars[str.charCodeAt(i++) & 255]
                } while (i < len && c1 == -1);
                if (c1 == -1) {
                    break
                }
                do {
                    c2 = base64DecodeChars[str.charCodeAt(i++) & 255]
                } while (i < len && c2 == -1);
                if (c2 == -1) {
                    break
                }
                out += String.fromCharCode((c1 << 2) | ((c2 & 48) >> 4));
                do {
                    c3 = str.charCodeAt(i++) & 255;
                    if (c3 == 61) {
                        return out
                    }
                    c3 = base64DecodeChars[c3]
                } while (i < len && c3 == -1);
                if (c3 == -1) {
                    break
                }
                out += String.fromCharCode(((c2 & 15) << 4) | ((c3 & 60) >> 2));
                do {
                    c4 = str.charCodeAt(i++) & 255;
                    if (c4 == 61) {
                        return out
                    }
                    c4 = base64DecodeChars[c4]
                } while (i < len && c4 == -1);
                if (c4 == -1) {
                    break
                }
                out += String.fromCharCode(((c3 & 3) << 6) | c4)
            }
            return out
        }
        function geturl(cp) {
            value = eval(eval(base64decode(cp).slice(4)));
            return value
        }
        '''
        js_context = execjs.compile(js_code)
        return js_context.call('geturl', cp)
if __name__ == '__main__':
    # Prompt for the comic name and start the crawl.
    comic_name = input('请输入漫画名:')
    PuFei(comic_name).search()
上一篇: 以30张漫画道别2019年
推荐阅读
-
pufei漫画网爬虫:完整代码分享
-
用JSP构建一个完整的新闻管理系统及源代码分享
-
Python爬虫实战(基础)-15获取东方财富网股票数据-写入csv(附完整代码)
-
ASCII 完整代码表及简介 - 欢迎转载本文。分享知识,造福人民,实现中华民族的伟大复兴!
-
Python 爬虫战斗(基本)-2 获取一首歌的歌词(附完整代码)
-
最新的 python 爬虫美国游销售完整代码
-
防爬汉芯城字体技巧说明 - 文章内所有内容仅为学习分享,非商用及违法目的,不包含完整代码、网络请求详情、特殊链接及数据接口等已做隐匿处理。切勿将其用于商业或非法途径,如因此产生的问题,作者概不负责!
-
新年烟火秀!Python版代码分享与完整源码
-
Python 3 如何绘制烟花效果?完整代码分享
-
华数杯国际数学建模B光伏电问题的完整解决方案及源代码分享 - 部分代码示例在文中,详细内容请查看附件!