爬虫实战:Scrapy爬取贝壳网二手房价格
Scrapy框架简介
构建scrapy工程
在桌面创建DataGet文件夹,进入cmd命令行(win下)
cd Desktop # cd到桌面
cd DataGet # cd到DataGet目录
scrapy startproject beikePrice # 创建爬虫工程
继续cd 到beikePrice下,可以看到模板代码文件,我们只要在这些模板文件上增加或者修改即可。
网站源代码解析
以西安的二手房房价为例,我们的起始页网站为: xa.ke.com/xiaoqu/
行政区信息解析
现在我们要按照行政区来爬取各区的房价数据,我们发现起始页面的顶部有按照行政区来分类的标签栏,我们Fn+F12查看网页源代码:
该行政区标签的网页源代码中存取了每个行政区的url,我们可以通过解析该信息获取每个行政区房价页面的url地址。
页面页数信息解析
我们点击进入碑林区的页面,然后依次进入第2页、第3页...,其url地址为:
xa.ke.com/xiaoqu/beilin/pg2/
xa.ke.com/xiaoqu/beilin/pg3/
因此url地址和页面的关系我们也发现了,那还有一个问题每个行政区的房价页面有多少页呢?这个信息能否在网页源代码中找到呢。
我们继续查看网页源代码:
鼠标定位到底部页面栏,我们可以很方便的找到页面信息:
page-data='{"totalPage":39,"curPage":2}'
发现当前页面信息和总页面信息都是存储在网页源代码中的。
房价信息解析
也可以很方便的定位到房价信息对应的源代码,使用xpath即可进行解析。
爬取思路总结
1.爬取的起始url:xa.ke.com/xiaoqu/
2.解析起始url获取各个行政区的房价url地址
3.对每一个行政区的房价url进行解析,得到其总页面数
4.将url和页面序列数进行拼接得到最终的url,请求,解析获得数据
5.使用高德接口的地理编码服务获取小区的经纬度信息
代码
数据爬取
爬虫代码beike.py
文件:
import time
import scrapy
from beikePrice.items import BeikepriceItem
from lxml import etree
from inline_requests import inline_requests
from fake_useragent import UserAgent
from scrapy.selector import Selector
from lxml import html
class BeikeSpider(scrapy.Spider):
    """Crawl second-hand community (xiaoqu) prices for Xi'an from ke.com.

    Flow:
      1. Request the start page and extract one URL per administrative district.
      2. For each district, read the pagination metadata to get the page count.
      3. Request every result page and yield one BeikepriceItem per community.
    """

    name = 'beike'
    # Domain must match the site actually crawled (Xi'an, not Beijing);
    # requests below also pass dont_filter=True so nothing gets dropped.
    allowed_domains = ['xa.ke.com']
    start_urls = ['https://xa.ke.com/xiaoqu/']
    headers = {"User-Agent": UserAgent().random,
               "Cookie": 'lianjia_uuid=b4e400f9-afdb-4906-a392-11a197b04320; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1662636442; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221831cdac35e21a-0bdae5f108cd37-26021c51-1024000-1831cdac35f413%22%2C%22%24device_id%22%3A%221831cdac35e21a-0bdae5f108cd37-26021c51-1024000-1831cdac35f413%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; select_city=610100; lianjia_ssid=b1173161-ac78-4180-b042-0f31a3dc4bf0; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1662854380; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiNDBlMWM2N2Y2MjIzNDRlMjMyODQzZDQwNDZmMTQyZWY5Yzc4YjQ4MjI3NGE0ZmU2ZjFjMjc4ODJiNDc2YzE0YzgyMTE2NmI3NmNlNGM4ODEyNTM1MDBlYmM5ZTY1NGViODBiNmNiNTYzYmNlNGQ2NmM0ZmUzODQwZGEzZjc5NWM0OThjZjJjZmExMjVjYWJmZDgzZmJhODRiN2EwM2YyZmY0ODg3YTc3YWJlNDFmZDc0ZGM5MGY1MWE0Y2U3Y2NmZjhkNDY2YTc0YjI3ZTBhZTY5M2ViMzlhMDM1MTliMWYwZmNmNWE0MWMyNWM1YzRjMTI2MjhhN2ZmZjlhMWZjNlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkMmMxOTIyMFwifSIsInIiOiJodHRwczovL3hhLmtlLmNvbS94aWFvcXUvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=',
               "Referer": "https://xa.ke.com/xiaoqu/weiyang/"}
    # Constant fields copied into every yielded item.
    classItems = {"province": "陕西省", "city": "西安市"}

    def start_requests(self):
        """Kick off the crawl from the listing index page."""
        yield scrapy.Request(url=self.start_urls[0], headers=self.headers,
                             dont_filter=True)

    @inline_requests
    def parse(self, response):
        """Parse the index page, then walk every district and result page."""
        # District links live in the tab bar: <div data-role="ershoufang">.
        district_paths = response.xpath(
            '//div[@data-role="ershoufang"]/div/a/@href').extract()
        district_urls = ['https://xa.ke.com' + p for p in district_paths]
        print(district_urls)

        for district_url in district_urls:
            print(district_url)
            first_page = yield scrapy.Request(url=district_url,
                                              headers=self.headers,
                                              dont_filter=True)
            # The pagination div carries e.g. page-data='{"totalPage":39,"curPage":1}'.
            page_info = first_page.xpath(
                '//div[@class="page-box fr"]/div/@page-data').get()
            if page_info is None:
                # District page without a pagination bar — nothing to crawl.
                continue
            # Extract the number between '"totalPage":' and the next ',"'.
            total_page_num = int(page_info.split('":')[1].split(',"')[0])
            time.sleep(1)  # be polite between district requests

            # District URLs end with '/', so appending 'pgN' yields .../xiaoqu/beilin/pgN.
            page_urls = ['{}pg{}'.format(district_url, j)
                         for j in range(1, total_page_num + 1)]
            for page_url in page_urls:
                print(page_url)
                page_res = yield scrapy.Request(url=page_url,
                                                headers=self.headers,
                                                dont_filter=True)
                for li in page_res.xpath('//ul[@class="listContent"]/li'):
                    node = etree.HTML(li.get())
                    # Fresh item per listing: yielding one shared mutated item
                    # can hand stale data to asynchronous pipelines.
                    items = BeikepriceItem()
                    items.update(self.classItems)
                    # Community name.
                    items["projectName"] = node.xpath(
                        '//div[1]/div[1]/a[@class="maidian-detail"]/text()')[0]
                    # Reference price; some communities list no price.
                    try:
                        items["referencePrice"] = node.xpath(
                            '//div[2]/div[1]/div[@class="totalPrice"]/span/text()')[0]
                    except IndexError:
                        items["referencePrice"] = "--"
                    # Administrative district.
                    items["region"] = node.xpath(
                        '//div[1]/div[3]/a[position()<2]/text()')[0] + "区"
                    # Price-update month, e.g. "9月..." -> "2022-09"... (year hard-coded
                    # by the original tutorial; single-digit months only).
                    items["updateDate"] = "2022-0" + node.xpath(
                        '//div[2]/div[1]/div[@class="priceDesc"]/text()')[0].split("月")[0]
                    yield items
items.py
文件:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field
class BeikepriceItem(scrapy.Item):
    """One crawled community price record."""

    # Community (xiaoqu) name.
    projectName = scrapy.Field()
    # Listed reference price for the community.
    referencePrice = scrapy.Field()
    # Administrative district the community belongs to.
    region = scrapy.Field()
    # Month the price was last updated.
    updateDate = scrapy.Field()
    # Constant location fields filled from the spider's classItems.
    province = scrapy.Field()
    city = scrapy.Field()
pipelines.py
文件:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from .items import BeikepriceItem
import pandas as pd
class BeikepricePipeline:
    """Append each scraped item to df_ershoufang.csv as it arrives."""

    def open_spider(self, spider):
        """Called once when the spider opens; a DB connection would go here."""
        pass
        # self.f = open('data.txt', 'w')

    def close_spider(self, spider):
        """Called once when the spider closes; cleanup would go here."""
        pass
        # self.f.close()

    def process_item(self, items, spider):
        """Write one item as a single CSV row and pass the item downstream.

        Note: rows are appended with header=False, so the output file carries
        no header row — downstream readers must supply the column names
        (projectName, referencePrice, region, updateDate) themselves.
        """
        # ItemAdapter.asdict() already returns a plain dict; no second
        # dict() conversion is needed.
        record = ItemAdapter(items).asdict()
        df = pd.DataFrame({'projectName': [record['projectName']],
                           'referencePrice': [record['referencePrice']],
                           'region': [record['region']],
                           'updateDate': [record['updateDate']]})
        print(df)
        df.to_csv(r'df_ershoufang.csv', encoding='utf_8_sig',
                  index=False, mode='a', header=False)
        # Scrapy convention: return the item itself (not a dict) so later
        # pipelines receive the same object type they were registered for.
        return items
settings.py
文件:
修改以下配置项即可:
# Project identity generated by `scrapy startproject beikePrice`.
BOT_NAME = 'beikePrice'
SPIDER_MODULES = ['beikePrice.spiders']
NEWSPIDER_MODULE = 'beikePrice.spiders'
# NOTE(review): with robots.txt obeyed, the target site may disallow these
# pages and the spider would fetch nothing — confirm, or set False for testing.
ROBOTSTXT_OBEY = True
# Enable the CSV-writing pipeline (300 = ordinary middle priority).
ITEM_PIPELINES = {
'beikePrice.pipelines.BeikepricePipeline': 300,
}
命令行切换到对应的爬虫工程目录下:执行scrapy crawl beike
即可执行爬虫。
获得位置数据
给获取到的小区添加经纬度数据,使用高德的API接口使用地理编码服务,输入结构化的地址名称,接口返回经纬度数据。
import requests
import pandas as pd
import json
import numpy as np
def get_loc(key_list=('你的第一个key', '你的第二个key'),
            input_csv=r'../df_ershoufang.csv',
            output_csv=r'house_price.csv'):
    """Geocode every crawled community via the AMap (Gaode) geocoding API.

    Reads the header-less CSV produced by BeikepricePipeline, queries
    https://restapi.amap.com/v3/geocode/geo for each community's structured
    address, and writes the data back out with added 'lng'/'lat' columns.

    Args:
        key_list: one or more AMap API keys; multiple keys are rotated at
            random to spread the request quota (replace the placeholders).
        input_csv: path of the crawled price CSV (no header row).
        output_csv: path of the CSV written with coordinates appended.
    """
    api_url = 'https://restapi.amap.com/v3/geocode/geo'
    # The pipeline wrote the file with header=False, so name the columns here
    # to match what it wrote (the original read 'district'/'name', which the
    # pipeline never produced).
    columns = ['projectName', 'referencePrice', 'region', 'updateDate']
    taz_df = pd.read_csv(input_csv, header=None, names=columns)
    lat_list = []
    lng_list = []
    for _, row in taz_df.iterrows():
        # Structured address: province + city + district + community name.
        address_name = '陕西省西安市' + row['region'] + row['projectName']
        _i = np.random.randint(0, len(key_list))
        para_dict = {'address': address_name, 'key': key_list[_i],
                     'output': 'JSON'}
        try:
            r = requests.get(api_url, params=para_dict)
            json_data = json.loads(r.text)
            if json_data['status'] == '1':
                # 'location' is a "lng,lat" string.
                geo = json_data['geocodes'][0]['location']
                longitude, latitude = geo.split(',')
            else:
                longitude = np.nan
                latitude = np.nan
        except (requests.RequestException, ValueError, KeyError, IndexError):
            longitude = np.nan
            latitude = np.nan
            # No `continue` here: the coordinate lists must stay aligned
            # one-to-one with taz_df's rows, otherwise every coordinate after
            # a failure shifts onto the wrong community.
        lat_list.append(latitude)
        lng_list.append(longitude)
        print(_i)
        print(longitude, latitude)
    loc_df = pd.DataFrame({'lng': lng_list, 'lat': lat_list})
    taz_df = pd.concat([taz_df, loc_df], axis=1)
    taz_df.to_csv(output_csv, encoding='utf_8_sig', index=False)
推荐阅读