[Python] Scraping and Cleaning Shanghai Residential Community Data (Anjuke, Lianjia and Fang.com)
最编程
2024-06-28 21:24:11
1. Preface:
Anjuke (安居客), Lianjia (链家) and Fang.com (房天下) are currently the most reliable online sources of residential community (xiaoqu) data. I have previously posted scraping guides for Lianjia and Fang.com, but those covered only part of the city (Pudong only). This time, for work, I needed the data for every community in Shanghai (villas and ordinary residences only), so I spent the Spring Festival holiday iterating through data analysis, scraping, cleaning and validation. This post records the process and shares the code.
2. Crawling approach:
Whether the target is Anjuke, Lianjia or Fang.com, the approach is the same four steps (a minimal sketch of the loop follows the list):
1. Get the URL of each administrative district.
2. Get the URLs of the business areas / sub-districts under each district.
3. Get the URL of every community in each business area / sub-district.
4. Visit the URLs from step 3 and scrape the page elements you need.
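Concretely, the four steps are one nested loop. Below is a minimal sketch of that loop; the get_links helper, the start URL, and all CSS selectors (div.district a and so on) are placeholders of mine, not the real markup of any of the three sites:

```python
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}

def get_links(page_url, css_selector):
    '''Fetch a page and return (text, href) pairs for every matching anchor.'''
    soup = BeautifulSoup(requests.get(page_url, headers=headers).text, 'lxml')
    return [(a.get_text(strip=True), a['href'])
            for a in soup.select(css_selector) if a.has_attr('href')]

def crawl(base):
    # Step 1: district links -> step 2: sub-district links ->
    # step 3: community links -> step 4: fetch and parse each detail page.
    for district, d in get_links(base, 'div.district a'):          # placeholder selector
        for town, t in get_links(base + d, 'div.sub-district a'):  # placeholder selector
            for name, c in get_links(base + t, 'a.community'):     # placeholder selector
                html = requests.get(base + c, headers=headers).text
                print(district, town, name, len(html))  # replace with real field parsing
```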
3. Comparing Anjuke, Fang.com and Lianjia:
I crawled the data from all three sites, but in the end used only the Anjuke data.
4. Lianjia code
```python
import requests
from bs4 import BeautifulSoup
import re
import time
import traceback
import math

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Host': 'sh.lianjia.com',
    'Cookie': ''  # left blank in the original post; supply your own if Lianjia asks for verification
}

def read_Lregion_dict():
    '''Read the district URL file and return it as a {url: district} dict'''
    with open('行政区url.txt', 'r') as f:
        large_region_list = f.readlines()
    large_region_dict = {}
    for ele in large_region_list:
        url, region = ele.split(' ')
        region = region.replace('\n', '')
        large_region_dict[url] = region
    return large_region_dict

def get_jiezhen_urls():
    '''Collect the URL of every sub-district (street/town)'''
    large_region_dict = read_Lregion_dict()
    small_region_dict = {}
    for k, v in large_region_dict.items():
        if v != '上海周边':  # skip the "around Shanghai" pseudo-district
            url = 'https://sh.lianjia.com' + k
            r = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            a = soup.find(name='div', attrs={'data-role': 'ershoufang'})
            esf_urls = a.find_all(name='a')
            for ele in esf_urls:
                href = ele.attrs['href']
                name = ele.string
                if name in large_region_dict.values():
                    continue  # district-level links repeat here; keep only sub-districts
                else:
                    small_region_dict[href] = name
                    with open('街镇url.txt', 'a', encoding='utf-8') as file:
                        file.write(','.join([v, name, href]))
                        file.write('\n')
                    print(v, name, href)

def region_total(url):
    '''Get the number of communities in an area'''
    url = r"https://sh.lianjia.com" + url + '?from=rec'
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    total_find = soup.find(name='h2', attrs={'class': 'total fl'})
    total_num = int(total_find.find(name='span').string.strip())
    return total_num

def get_all_urls():
    '''Collect the name and URL of every community'''
    with open('街镇url.txt', 'r', encoding='utf-8') as f:
        small_region_list = f.readlines()
    for ele in small_region_list:
        l_region, s_region, url = ele.split(',')
        url = url.replace('\n', '')
        total_num = region_total(url)
        pages = int(math.ceil(int(total_num) / 30))  # Lianjia lists 30 communities per page
        for i in range(1, pages + 1):
            if i == 1:
                i = ""  # page 1 has no 'pgN' suffix
            else:
                i = 'pg' + str(i)
            tmp_url = r"https://sh.lianjia.com" + url + i
            r = requests.get(url=tmp_url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            for j in soup.find_all(name='div', attrs={'class': 'title'}):
                community = str(j)
                if '''target="_blank"''' in community:
                    community_list = re.search('''<a href="(.*?)" target="_blank">(.*?)</a>.*?''', community)
                    community_url = community_list.group(1)
                    community_name = community_list.group(2)
                    with open('小区url.txt', 'a', encoding='utf-8') as file:
                        file.write(','.join([l_region, s_region, community_name, community_url]))
                        file.write('\n')
            time.sleep(1)
        print('{}, {} has {} communities on {} pages; all URLs collected!'.format(l_region, s_region, total_num, pages))

def get_communityInfo(l_region, s_region, community_name, community_url):
    '''Scrape the details of one community'''
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        unitPrice = soup.find(name='span', attrs={'class': 'xiaoquUnitPrice'}).string  # average unit price
    except:
        unitPrice = '空'  # '空' marks a missing value
    try:
        address = soup.find(name='div', attrs={'class': 'detailDesc'}).string  # address
        address = '"' + address + '"'  # quote it: addresses may contain commas
    except:
        address = '空'
    try:
        xiaoquInfo = soup.find_all(name='span', attrs={'class': 'xiaoquInfoContent'})  # attribute spans
        xiaoquInfo_list = [l_region, s_region]
        community_name = '"' + community_name + '"'
        xiaoquInfo_list.append(community_name)
        xiaoquInfo_list.append(address)
        xiaoquInfo_list.append(unitPrice)
        for info in xiaoquInfo:
            xiaoquInfo_list.append(info.string)
        xiaoquInfo_list.pop()  # drop the last span, which is not a community attribute
        export_communityInfo(xiaoquInfo_list)
        time.sleep(1)
        print('Scraped {}, {}: {}'.format(l_region, s_region, community_name))
    except:
        print('Error scraping {}, {}: {}, url: {}'.format(l_region, s_region, community_name, community_url))

def export_communityInfo(xiaoquInfo_list):
    '''Append one community record to the output file'''
    with open('上海地区小区信息.txt', 'a', encoding='utf-8') as file:
        file.write(','.join(xiaoquInfo_list))
        file.write('\n')

if __name__ == "__main__":
    # get_jiezhen_urls()  # step 2: collect sub-district URLs
    # get_all_urls()      # step 3: collect community names and URLs
    with open('小区url.csv', 'r') as f:  # note: reads 小区url.csv, presumably a cleaned copy of 小区url.txt
        xiaoqu_list = f.readlines()
    for ele in xiaoqu_list:
        l_region, s_region, community_name, community_url = ele.split(',')
        community_url = community_url.replace('\n', '')
        try:
            get_communityInfo(l_region, s_region, community_name, community_url)
        except:
            traceback.print_exc()
            break
```
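With 上海地区小区信息.txt written, the cleaning step is easiest in pandas. The sketch below rests on my own assumptions: the column names simply mirror the order in which get_communityInfo writes its fields, the seven trailing attribute names are a guess at Lianjia's xiaoquInfoContent spans and may not match your pages, and the output filename is invented. Quoted names and addresses survive the CSV parse because the scraper wraps them in double quotes; rows with an unexpected field count are skipped.

```python
import pandas as pd

# Assumed column order = write order in get_communityInfo; the last
# seven names are my guess at the xiaoquInfoContent spans.
cols = ['district', 'sub_district', 'name', 'address', 'unit_price',
        'built_year', 'building_type', 'property_fee', 'property_company',
        'developer', 'building_count', 'house_count']
df = pd.read_csv('上海地区小区信息.txt', header=None, names=cols,
                 encoding='utf-8', engine='python', on_bad_lines='skip')

df = df.drop_duplicates(subset=['district', 'sub_district', 'name'])
# '空' placeholders and stray units fall away; non-numeric prices become NaN
df['unit_price'] = pd.to_numeric(df['unit_price'].astype(str).str.extract(r'(\d+)')[0],
                                 errors='coerce')
df.to_csv('上海小区信息_clean.csv', index=False, encoding='utf-8-sig')  # invented filename
```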
5. Fang.com code
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import traceback

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'cookie': ''  # left blank in the original post
}

def get_true_url(old_url):
    '''Resolve Fang.com's anti-crawler redirect page to the real URL'''
    r = requests.get(url=old_url, headers=headers)
    if r'<title>跳转...</title>' in r.text:
        soup = BeautifulSoup(r.text, 'lxml')
        new_url = soup.find(name='a', attrs={'class': 'btn-redir'}).attrs['href']
        return new_url
    return old_url

def get_region_urls():
    '''Get the URL and name of every business area / sub-district in each Shanghai district'''
    sh_dict = {'浦东': '25', '嘉定': '29', '宝山': '30', '闵行': '18', '松江': '586', '普陀': '28',
               '静安': '21', '黄浦': '24', '虹口': '23', '青浦': '31', '奉贤': '32', '金山': '35',
               '杨浦': '26', '徐汇': '19', '长宁': '20', '崇明': '996'}
    for l_region_name, l_region_url in sh_dict.items():
        # list-page URL with the author's type filters baked into the path
        url = r"https://sh.esf.fang.com/housing/" + l_region_url + '__0_3_0_0_1_0_0_0/'
        true_url = get_true_url(url)
        r = requests.get(url=true_url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        a = soup.find(name='p', attrs={'id': 'shangQuancontain', 'class': 'contain'})
        for i in a.find_all(name='a'):
            if i.string != '不限':  # skip the "no limit" pseudo-entry
                this_url = r"https://sh.esf.fang.com" + i.attrs['href']
                this_url_list = get_region_url(this_url)
                with open('上海地区街镇url.txt', 'a', encoding='utf-8') as file:
                    for tmp_url in this_url_list:
                        file.write(','.join([l_region_name, i.string, tmp_url]))
                        file.write('\n')
        print('{} done'.format(l_region_name))

def get_region_url(old_url):
    '''Get the pagination URLs of an area'''
    true_url = get_true_url(old_url)
    r = requests.get(url=true_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    page_url = soup.find(name='div', attrs={'class': 'fanye gray6'})
    page_url_list = []
    page_url_list.append(old_url)
    for j in page_url.find_all(name='a'):
        if 'href' in j.attrs:
            temp_url = r'https://sh.esf.fang.com/' + j.attrs['href'][1:]
            if temp_url not in page_url_list:
                page_url_list.append(temp_url)
    return page_url_list

def get_xiaoqu_url(bigregion, smallregion, old_url):
    '''Get the community names, types and URLs on one listing page'''
    true_url = get_true_url(old_url)
    r = requests.get(url=true_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    j = 0
    for i in soup.find_all(name='a', attrs={'class': 'plotTit', 'target': '_blank'}):
        xiaoqu_type = soup.find('a', text=i.string, attrs={'class': 'plotTit', 'target': '_blank'}).parent.find('span', attrs={'class': 'plotFangType'}).string
        xiaoqu_name = i.string
        xiaoqu_url = 'https://sh.esf.fang.com/' + i.attrs['href'][1:]
        xiaoqu_url = xiaoqu_url.replace('.htm', '/housedetail.htm')  # jump straight to the detail tab
        print(bigregion, smallregion, xiaoqu_name, xiaoqu_type, xiaoqu_url)
        j += 1
        with open('上海地区小区url.txt', 'a', encoding='utf-8') as file:
            file.write(','.join([bigregion, smallregion, xiaoqu_name, xiaoqu_type, xiaoqu_url]))
            file.write('\n')
    time.sleep(1)
    print(bigregion, smallregion, old_url, 'all community URLs collected, {} records'.format(j))
    print('-' * 100)

def get_all_urls(last_url=None):
    '''Get the URLs of all communities.
    The result still needs cleaning: some communities straddle two areas,
    so duplicates appear (see the dedup sketch below).'''
    with open('上海地区街镇url.txt', 'r', encoding='utf-8') as f:
        region_list = f.readlines()
    event_tracking = last_url is None  # True when no resume checkpoint is given
    for i in range(len(region_list)):
        l_region, s_region, url = region_list[i].split(',')
        url = url.replace('\n', '')
        if last_url == url:
            event_tracking = True  # resume at the checkpoint line
        if event_tracking:
            print(l_region, s_region, url)
            get_xiaoqu_url(l_region, s_region, url)

def get_total_informations(l_region, s_region, community_name, community_type, community_url):
    '''Scrape the useful details of one community'''
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    informations = soup.find(name='div', attrs={'class': 'village_info base_info'})
    if not informations:
        print('...')  # the original post is cut off at this point
```
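The script breaks off there in the original post. One loose end worth showing is the cleanup that get_all_urls warns about: communities sitting on the boundary of two business areas are written to 上海地区小区url.txt more than once. A minimal dedup pass, keyed on the detail-page URL since it uniquely identifies a community (the output filename is my own):

```python
import csv

seen = set()
with open('上海地区小区url.txt', 'r', encoding='utf-8') as src, \
     open('上海地区小区url_dedup.txt', 'w', encoding='utf-8', newline='') as dst:
    writer = csv.writer(dst)
    for row in csv.reader(src):
        if not row:
            continue
        url = row[-1]  # last field is the detail-page URL
        if url not in seen:
            seen.add(url)
            writer.writerow(row)  # keep only the first occurrence
```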