Nursing Home Data Scraper

Date: 2019-01-15
This article introduces a nursing home data scraper, covering a working example, practical techniques, the key ideas involved, and the points to watch out for. Readers building something similar may find it a useful reference.
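The script below follows a producer-consumer design: two crawl threads pull page numbers from a page queue, fetch each listing page from yanglao.com.cn, and push the raw HTML onto a data queue; two parse threads consume that queue, extract each home's name, address, bed count, and price range with XPath, and write rows to a shared CSV file under a lock.
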
import requests
from lxml import etree
import csv
import threading
from queue import Queue
import time

parse_count = 1        # pages parsed so far, shared across parse threads
crawl_fail_list = []   # page numbers whose request failed
parse_fail_list = []   # parse_count values at which parsing failed
# sample listing URL: http://www.yanglao.com.cn/resthome_2
class crawl_thread(threading.Thread):

	def __init__(self, name, page_queue, data_queue):
		super().__init__()
		self.name = name
		self.page_queue = page_queue
		self.data_queue = data_queue

	def run(self):
		global crawl_fail_list
		print("********* %s started *********" % self.name)
		while True:
			# stop this thread once the page queue is empty
			if self.page_queue.empty():
				break
			# take a page number from the queue and build the listing URL
			try:
				page = self.page_queue.get()
				url = 'http://www.yanglao.com.cn/resthome_' + str(page)
				headers = {
					'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
				}
				# send the request and hand the raw HTML to the parse threads
				r = requests.get(url, headers=headers)
				self.data_queue.put(r.text)
				print('%s: page %s crawled' % (self.name, page))
				time.sleep(0.3)  # throttle requests a little
			except Exception as e:
				print(e)
				crawl_fail_list.append(page)  # record the failed page for a later retry
		print("********* %s finished *********" % self.name)

class parse_thread(threading.Thread):
	
	def __init__(self, name, data_queue, suo, writer):
		super().__init__()
		self.name = name
		self.data_queue = data_queue
		self.suo = suo  # lock shared by all parse threads for csv writes
		self.writer = writer

	def run(self):
		global parse_count
		global parse_fail_list
		print("********* %s started *********" % self.name)
		while True:
			# pull a page from the data queue; give up after 15s with no new data
			try:
				content = self.data_queue.get(True, 15)
			except Exception:  # queue.Empty: no new pages for 15s
				break
			# parse the listing page
			try:
				tree = etree.HTML(content)
				li_list = tree.xpath('//li[@class="rest-item"]')
				for li in li_list:
					# the labels stripped below ('地址:' etc.) match the Chinese text on the site
					name = li.xpath('.//h4/a/text()')[0]
					location = li.xpath('.//ul/li[1]/text()')[0].replace('地址:', '')
					beds = li.xpath('.//ul/li[2]/text()')[0].replace('床位数:', '').replace('张', '')
					money = li.xpath('.//ul/li[3]/text()')[0].replace('收费区间:', '')
					lt = [name, location, beds, money]
					# the csv writer is shared, so writes happen under the lock
					self.suo.acquire()
					self.writer.writerow(lt)
					self.suo.release()
				print("%s: batch %s written" % (self.name, parse_count))
			# on failure, log the error and move on to the next page
			except Exception as e:
				print(e)
				parse_fail_list.append(parse_count)
			parse_count += 1  # note: not page-accurate, and unsynchronized across parse threads
		print("********* %s finished *********" % self.name)

##################################################################
def create_queue():
	# queue of page numbers to crawl; the site has 1675 listing pages
	page_queue = Queue()
	for page in range(1, 1676):
		page_queue.put(page)
	# queue of raw HTML handed from the crawl threads to the parse threads
	data_queue = Queue()
	return page_queue, data_queue
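# (Note, not in the original) with up to 1675 pages of raw HTML buffered, an
# unbounded data_queue can grow large whenever parsing falls behind; a
# bounded queue would apply backpressure instead, e.g.:
#     data_queue = Queue(maxsize=100)  # put() blocks once 100 pages are waiting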

def create_crawl_list(page_queue, data_queue):
	crawl_list = []
	name_list = ['crawler-1', 'crawler-2']
	for name in name_list:
		crawl = crawl_thread(name, page_queue, data_queue)
		crawl_list.append(crawl)
	return crawl_list

def create_parse_list(data_queue, suo, writer):
	parse_list = []
	name_list = ['parser-1', 'parser-2']
	for name in name_list:
		parse = parse_thread(name, data_queue, suo, writer)
		parse_list.append(parse)
	return parse_list

###################################################
def main():
	# create the page and data queues
	page_queue, data_queue = create_queue()
	# a lock to serialize csv writes across the parse threads
	suo = threading.Lock()
	# open the output file and create the csv writer
	f = open('nursing_home_data_full.csv', 'a', encoding='utf8', newline='')
	writer = csv.writer(f)
	# create the crawl threads and parse threads
	crawl_list = create_crawl_list(page_queue, data_queue)
	parse_list = create_parse_list(data_queue, suo, writer)
	print(crawl_list, parse_list)
	# start everything
	for crawl in crawl_list:
		crawl.start()
	for parse in parse_list:
		parse.start()
	# keep the main thread alive until all workers finish
	for crawl in crawl_list:
		crawl.join()
	for parse in parse_list:
		parse.join()
	# clean up and report any failed pages
	f.close()
	print('All threads closed, done!')
	print(crawl_fail_list)
	print(parse_fail_list)


if __name__ == '__main__':
	main()
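
The parse threads stop by letting data_queue.get(True, 15) time out, which works but ties shutdown to a guess about how slow the crawlers might be. A more deterministic pattern (a minimal sketch, not part of the original script) is to push one sentinel per consumer after the producers are joined:

import threading
from queue import Queue

SENTINEL = object()  # unique marker; can never collide with real page HTML

def consumer(q):
	while True:
		item = q.get()
		if item is SENTINEL:  # producers are done, exit immediately
			break
		print('parsed:', item)

q = Queue()
workers = [threading.Thread(target=consumer, args=(q,)) for _ in range(2)]
for w in workers:
	w.start()
for page in ['<html>1</html>', '<html>2</html>']:
	q.put(page)
for _ in workers:
	q.put(SENTINEL)  # one sentinel per consumer thread
for w in workers:
	w.join()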