
[Python] Scraping and cleaning Shanghai residential community data (Anjuke, Lianjia, and Fang.com)


I. Preface:
Anjuke, Lianjia, and Fang.com (房天下) are currently the most accurate online sources of residential community (小区) data. I have previously posted scraping guides for Lianjia and Fang.com covering only part of the city (Pudong). This time, for work, I needed community data for all of Shanghai (villas and ordinary residences only), so I spent the Spring Festival holiday on data analysis, retrieval, cleaning, and validation. This post records the process and shares the code.

II. Scraping approach:
Whether the source is Anjuke, Lianjia, or Fang.com, the way to get the data is the same:
1. Get the URL of each district.
2. For each district, get the URLs of its business circles / sub-districts (街镇).
3. For each business circle / sub-district in each district, get the URLs of all of its communities.
4. Using the URLs from step 3, scrape the required page elements (a minimal sketch of this drill-down follows the list).
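To make the pattern concrete, here is a minimal sketch of the drill-down. The base URL and container attributes below are placeholders of my own, not the real page structure of any of the three sites; the same "collect the links inside a container" step is simply reused at steps 1 through 3, and step 4 then parses each detail page:

import requests
from bs4 import BeautifulSoup

# Sketch only: HEADERS, the example URL and the container attrs are placeholders.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def collect_links(page_url, container_attrs):
    '''Return {link text: absolute url} for every <a> inside the given container.'''
    soup = BeautifulSoup(requests.get(page_url, headers=HEADERS).text, 'lxml')
    container = soup.find(name='div', attrs=container_attrs)
    links = {}
    for a in container.find_all(name='a'):
        if a.get('href') and a.string:
            links[a.string.strip()] = requests.compat.urljoin(page_url, a['href'])
    return links

# Steps 1-3: districts -> sub-districts -> community pages (placeholder URL/attrs)
# districts = collect_links('https://sh.example.com/xiaoqu/', {'data-role': 'ershoufang'})
# blocks = {name: collect_links(url, {'data-role': 'ershoufang'}) for name, url in districts.items()}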

III. Comparing Anjuke, Fang.com, and Lianjia:

I scraped the data from all three sites, but in the end only the Anjuke data was used.
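Whichever site you use, the raw export needs a de-duplication pass, because a community sitting on the boundary between two areas is written once per area. Below is a rough cleaning sketch, taking the Lianjia output file from the next section as the example; the column names, the output file name, and the choice to keep only the first five fields are my own assumptions, not part of the original scripts:

import csv
import pandas as pd

rows = []
with open('上海地区小区信息.txt', encoding='utf-8', newline='') as f:
    for parts in csv.reader(f):  # csv handles the quoted name/address fields
        # export order in the Lianjia code below:
        # district, sub-district, "name", "address", unit price, then the other attribute fields
        if len(parts) >= 5:
            rows.append(parts[:5])

df = pd.DataFrame(rows, columns=['district', 'block', 'name', 'address', 'unit_price'])
# communities spanning two areas are written once per area, so drop the duplicates
df = df.drop_duplicates(subset=['name', 'address']).reset_index(drop=True)
df.to_csv('小区信息_去重.csv', index=False, encoding='utf-8-sig')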

IV. Lianjia code

import requests
from bs4 import BeautifulSoup
import re
import time
import traceback
import math

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Host': 'sh.lianjia.com',
    'Cookie': ''
}

def read_Lregion_dict():
    '''Read the district URL file and return it as a dict.'''
    with open('行政区url.txt', 'r') as f:
        large_region_list = f.readlines()
    large_region_dict = {}
    for ele in large_region_list:
        url, region = ele.split(' ')
        region = region.replace('\n', '')
        large_region_dict[url] = region
    return large_region_dict

def get_jiezhen_urls():
    '''Get the sub-district (街镇) URLs.'''
    large_region_dict = read_Lregion_dict()
    small_region_dict = {}
    for k, v in large_region_dict.items():
        if v != '上海周边':  # skip the "around Shanghai" entry
            url = 'https://sh.lianjia.com' + k
            r = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            a = soup.find(name='div', attrs={'data-role': 'ershoufang'})
            esf_urls = a.find_all(name='a')
            for ele in esf_urls:
                href = ele.attrs['href']
                name = ele.string
                if name in large_region_dict.values():
                    continue
                else:
                    small_region_dict[href] = name
                    with open('街镇url.txt', 'a', encoding='utf-8') as file:
                        file.write(','.join([v, name, href]))
                        file.write('\n')
                    print(v, name, href)

def region_total(url):
    '''Get the number of communities in this area.'''
    url = r"https://sh.lianjia.com" + url + '?from=rec'
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    total_find = soup.find(name='h2', attrs={'class': 'total fl'})
    total_num = int(total_find.find(name='span').string.strip())
    return total_num

def get_all_urls():
    '''Get the names and links of all communities.'''
    with open('街镇url.txt', 'r', encoding='utf-8') as f:
        small_region_list = f.readlines()
    for ele in small_region_list:
        l_region, s_region, url = ele.split(',')
        url = url.replace('\n', '')
        total_num = region_total(url)
        pages = int(math.ceil(int(total_num)/30))  # 30 communities per listing page
        for i in range(1, pages+1):
            if i == 1:
                i = ""
            else:
                i = 'pg' + str(i)
            tmp_url = r"https://sh.lianjia.com" + url + i
            r = requests.get(url=tmp_url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            for j in soup.find_all(name='div', attrs={'class': 'title'}):
                community = str(j)
                if '''target="_blank"''' in community:
                    community_list = re.search('''<a href="(.*?)" target="_blank">(.*?)</a>.*?''', community)
                    community_url = community_list.group(1)
                    community_name = community_list.group(2)
                    with open('小区url.txt', 'a', encoding='utf-8') as file:
                        file.write(','.join([l_region, s_region, community_name, community_url]))
                        file.write('\n')
            time.sleep(1)
        print('{}, {} has {} communities over {} pages; all urls crawled!'.format(l_region, s_region, total_num, pages))

def get_communityInfo(l_region, s_region, community_name, community_url):
    '''Get the details of one community.'''
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        unitPrice = soup.find(name='span', attrs={'class': 'xiaoquUnitPrice'}).string  # average unit price
    except:
        unitPrice = ''
    try:
        address = soup.find(name='div', attrs={'class': 'detailDesc'}).string  # community address
        address = '"' + address + '"'
    except:
        address = ''
    try:
        xiaoquInfo = soup.find_all(name='span', attrs={'class': 'xiaoquInfoContent'})  # community attributes
        xiaoquInfo_list = [l_region, s_region]
        community_name = '"' + community_name + '"'
        xiaoquInfo_list.append(community_name)
        xiaoquInfo_list.append(address)
        xiaoquInfo_list.append(unitPrice)
        for info in xiaoquInfo:
            xiaoquInfo_list.append(info.string)
        xiaoquInfo_list.pop()  # drop the last attribute field
        export_communityInfo(xiaoquInfo_list)
        time.sleep(1)
        print('Crawled {}, {}: {}'.format(l_region, s_region, community_name))
    except:
        print('Error crawling {}, {}: {}, url is {}'.format(l_region, s_region, community_name, community_url))

def export_communityInfo(xiaoquInfo_list):
    '''Append one community's info to the output file.'''
    with open('上海地区小区信息.txt', 'a', encoding='utf-8') as file:
        file.write(','.join(xiaoquInfo_list))
        file.write('\n')

if __name__ == "__main__":
    # get_jiezhen_urls()  # get the sub-district urls
    # get_all_urls()  # get the names and links of all communities
    with open('小区url.csv', 'r') as f:
        xiaoqu_list = f.readlines()
        for ele in xiaoqu_list:
            l_region, s_region, community_name, community_url = ele.split(',')
            community_url = community_url.replace('\n', '')
            try:
                get_communityInfo(l_region, s_region, community_name, community_url)
            except:
                traceback.print_exc()
                break
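One practical note on the script above: every page is fetched with a single requests.get plus a fixed one-second pause, so a transient network error or an anti-crawler page stops the run at that point. A small retry wrapper like the sketch below can be swapped in for the bare requests.get calls, both here and in the Fang.com script in the next section; the retry count, pause, and timeout are arbitrary values of my own, not part of the original code:

import time
import requests

def get_with_retry(url, headers, retries=3, pause=2, timeout=10):
    '''requests.get with simple retries and a growing pause between attempts.'''
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url=url, headers=headers, timeout=timeout)
            if r.status_code == 200:
                return r
        except requests.RequestException:
            pass  # swallow the network error and retry
        time.sleep(pause * attempt)  # back off a little more after each failure
    raise RuntimeError('failed to fetch {} after {} attempts'.format(url, retries))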

 

V. Fang.com (房天下) code

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import traceback

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'cookie': ''
}

def get_true_url(old_url):
    '''Follow the "跳转..." (redirecting) interstitial page, if present, and return the real URL.'''
    r = requests.get(url=old_url, headers=headers)
    if r'<title>跳转...</title>' in r.text:
        soup = BeautifulSoup(r.text, 'lxml')
        new_url = soup.find(name='a', attrs={'class': 'btn-redir'}).attrs['href']
        return new_url
    return old_url

def get_region_urls():
    '''Get the URL and name of each sub-district within every Shanghai district.'''
    sh_dict = {'浦东': '25', '嘉定': '29', '宝山': '30', '闵行': '18', '松江': '586', '普陀': '28',
               '静安': '21', '黄浦': '24', '虹口': '23', '青浦': '31', '奉贤': '32', '金山': '35',
               '杨浦': '26', '徐汇': '19', '长宁': '20', '崇明': '996'}
    for l_region_name, l_region_url in sh_dict.items():
        url = r"https://sh.esf.fang.com/housing/" + l_region_url + '__0_3_0_0_1_0_0_0/'
        true_url = get_true_url(url)
        r = requests.get(url=true_url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        a = soup.find(name='p', attrs={'id': 'shangQuancontain', 'class': 'contain'})
        for i in a.find_all(name='a'):
            if i.string != '不限':  # skip the "no limit" link
                this_url = r"https://sh.esf.fang.com" + i.attrs['href']
                this_url_list = get_region_url(this_url)
                with open('上海地区街镇url.txt', 'a', encoding='utf-8') as file:
                    for tmp_url in this_url_list:
                        file.write(','.join([l_region_name, i.string, tmp_url]))
                        file.write('\n')
        print('{} done'.format(l_region_name))

def get_region_url(old_url):
    '''Get the other page URLs of this area.'''
    true_url = get_true_url(old_url)
    r = requests.get(url=true_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    page_url = soup.find(name='div', attrs={'class': 'fanye gray6'})
    page_url_list = []
    page_url_list.append(old_url)
    for j in page_url.find_all(name='a'):
        if 'href' in j.attrs:
            temp_url = r'https://sh.esf.fang.com/' + j.attrs['href'][1:]
            if temp_url not in page_url_list:
                page_url_list.append(temp_url)
    return page_url_list

def get_xiaoqu_url(bigregion, smallregion, old_url):
    '''Get the community names and URLs on one page of an area.'''
    true_url = get_true_url(old_url)
    r = requests.get(url=true_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    j = 0
    for i in soup.find_all(name='a', attrs={'class': 'plotTit', 'target': '_blank'}):
        xiaoqu_type = soup.find('a', text=i.string, attrs={'class': 'plotTit', 'target': '_blank'}).parent.find('span', attrs={'class': 'plotFangType'}).string
        xiaoqu_name = i.string
        xiaoqu_url = 'https://sh.esf.fang.com/' + i.attrs['href'][1:]
        xiaoqu_url = xiaoqu_url.replace('.htm', '/housedetail.htm')
        print(bigregion, smallregion, xiaoqu_name, xiaoqu_type, xiaoqu_url)
        j += 1
        with open('上海地区小区url.txt', 'a', encoding='utf-8') as file:
            file.write(','.join([bigregion, smallregion, xiaoqu_name, xiaoqu_type, xiaoqu_url]))
            file.write('\n')
    time.sleep(1)
    print(bigregion, smallregion, old_url, 'all community urls fetched, {} records in total'.format(j))
    print('-'*100)

def get_all_urls(last_url=None):
    '''Get the URLs of all communities.
    The result still needs cleaning afterwards: some communities span areas, so there are duplicates.'''
    with open('上海地区街镇url.txt', 'r', encoding='utf-8') as f:
        region_list = f.readlines()
        event_tracking = last_url is None  # with no resume point, crawl everything from the start
        for i in range(len(region_list)):
            l_region, s_region, url = region_list[i].split(',')
            url = url.replace('\n', '')
            if last_url == url:
                event_tracking = True  # resume from the given url
            if event_tracking:
                print(l_region, s_region, url)
                get_xiaoqu_url(l_region, s_region, url)

def get_total_informations(l_region, s_region, community_name, community_type, community_url):
    '''Scrape the useful fields of one community.'''
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    informations = soup.find(name='div', attrs={'class': 'village_info base_info'})
    if not informations:
        print('