欢迎您访问 最编程 本站为您分享编程语言代码,编程技术文章!
您现在的位置是: 首页

爬虫,爬取链家网站上的北京二手房信息。

最编程 2024-06-03 17:24:18
...
# 链家网二手房信息爬取
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Base address of the Beijing second-hand listing index, and the path
# prefix that precedes the page number (e.g. ".../ershoufang/pg2/").
url = 'http://bj.lianjia.com/ershoufang/'
page = 'pg'

# Request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': 'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&wd=&eqid=c3435a7d00146bd600000003582bfd1f',
}

# Fetch the listing pages in order and collect their raw HTML.
# range(1, 3) requests pages 1 and 2; each address is <url><page><N>/.
page_bodies = []
for i in range(1, 3):
    a = url + page + str(i) + '/'
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url=a, headers=headers, timeout=10)
    # Stop on an HTTP error rather than parsing an error page as if it
    # contained listings.
    r.raise_for_status()
    page_bodies.append(r.content)
    # Pause 0.5 seconds between requests.
    time.sleep(0.5)

# All pages are handed on as a single bytes object: the response bodies
# concatenated in page order.
html = b''.join(page_bodies)

# Parse the downloaded HTML and extract one value per listing into a set
# of parallel lists (index k in every list describes the same listing).
lj = BeautifulSoup(html, 'html.parser')


def _text_of(node, tag_name, css_class):
    """Return the text of the first `tag_name.css_class` under `node`.

    Returns '' when no such element exists, so an entry that lacks one
    field does not abort the whole run with AttributeError.
    """
    found = node.find(tag_name, css_class)
    return '' if found is None else found.get_text()


# Patterns for the "N people following" / "N viewings" text nodes;
# compiled once here instead of on every loop iteration.
_GUANZHU_RE = re.compile('人关注')
_DAIKAN_RE = re.compile('次带看')

# Every listing is an <li class="clear"> element.
clears = lj.find_all('li', attrs={'class': 'clear'})
houseInfo = []
guanzhuInfo = []
daikanInfo = []
timeInfo = []
subwayInfo = []
positionInfo = []
totalpriceInfo = []

for clear in clears:
    houseInfo.append(_text_of(clear, 'div', 'houseInfo'))

    # These two are located by their text content rather than by tag/class;
    # find() yields the matching text node, or None when there is no match.
    guanzhuInfo.append(clear.find(text=_GUANZHU_RE))
    daikanInfo.append(clear.find(text=_DAIKAN_RE))

    timeInfo.append(_text_of(clear, 'div', 'timeInfo'))

    # The subway tag is optional; a missing tag is recorded as ''.
    subwayInfo.append(_text_of(clear, 'span', 'subway'))

    positionInfo.append(_text_of(clear, 'div', 'positionInfo'))
    totalpriceInfo.append(_text_of(clear, 'div', 'totalPrice'))

# Gather the scraped lists into one table; the dict's insertion order is
# also the column order of the frame.
data = {
    'houseInfo': houseInfo,
    'guanzhuInfo': guanzhuInfo,
    'daikanInfo': daikanInfo,
    'timeInfo': timeInfo,
    'subwayInfo': subwayInfo,
    'positionInfo': positionInfo,
    'totalpriceInfo': totalpriceInfo,
}
frame = pd.DataFrame(data, columns=list(data))
# print(frame.head())

# Break the two '/'-separated composite fields into one column per part.
house_fields = ['xiaoqu', 'huxing', 'mianji', 'chaoxiang', 'zhuangxiu', 'dianti']
position_fields = ['ceng', 'nian', 'position']
houseInfo_split = pd.DataFrame(
    [entry.split('/') for entry in frame.houseInfo],
    index=frame.index,
    columns=house_fields,
)
positionInfo_split = pd.DataFrame(
    [entry.split('/') for entry in frame.positionInfo],
    index=frame.index,
    columns=position_fields,
)

# Drop the composite columns now that they are split, then stitch the
# remaining columns and the split-out columns back together by row index.
frame.drop(columns=['houseInfo', 'positionInfo'], inplace=True)
split = houseInfo_split.merge(positionInfo_split, left_index=True, right_index=True)
house = frame.merge(split, left_index=True, right_index=True)

# Show a preview and write the full table to house.csv.
print(house.head())
house.to_csv('house.csv')

推荐阅读