How to Scrape Weibo Data with Python and Store It in MySQL or an Excel File
Over winter break I had to compile Weibo statistics for my department. There was no way around it, and being a lazy person I just did it in Python. The code is rough and not very good, so please bear with me.
Overall the project splits into two modules: one crawls Weibo data into a database, and the other pulls data from the database into an Excel spreadsheet. There is also a config file: the duty roster changes almost every week, the dates always run Monday through Sunday, and for various reasons the statistics could not be computed in real time, so a config file controls those parts of the code. Note that I created the MySQL tables in advance. (Annoyingly, when I got back from break a MySQL configuration file had somehow gone missing; it took me ages to reinstall, and all the data was lost.)
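Since the tables have to exist before the crawler runs, here is a minimal sketch of a schema matching the seven values inserted per post below. The column names and types are my guesses (the original definitions were lost in the reinstall), so adjust them to taste:

```python
import pymysql

# Guessed schema for the seven columns the crawler inserts per post;
# the real column names/types were lost with the MySQL reinstall.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='hfut_weibo', charset='utf8')
cursor = db.cursor()
for table in ('inital_weibo_data', 'inital_weibo_data_special'):
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS %s (
            date            VARCHAR(32),   -- created_at string from the API
            attitudes_count INT,           -- likes
            comments_count  INT,
            reposts_count   INT,
            weibo_index     FLOAT,         -- weighted engagement score
            title           VARCHAR(255),  -- column hashtag / first link text
            data            TEXT           -- post body (HTML)
        ) DEFAULT CHARSET=utf8
    ''' % table)
db.commit()
db.close()
```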
To be fair, the code is simple; what actually needs attention are the quirks of the scraped data:
The Weibo web client and the mobile endpoint do not return the same data, and some posts are missing.
The first module crawls the account's posts through the m.weibo.cn mobile API and inserts them into MySQL:

```python
import re
import time

import pymysql
import requests
from pyquery import PyQuery as py

# The two tables (inital_weibo_data and inital_weibo_data_special) must be
# created in the hfut_weibo database beforehand.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='hfut_weibo', use_unicode=True, charset='utf8')
cursor = db.cursor()
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')

# Pretend to be a mobile browser so m.weibo.cn serves its JSON API.
headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Mobile Safari/537.36'
}

# Hashtags marking the special columns; posts whose title contains one go
# into inital_weibo_data_special instead of inital_weibo_data.
SPECIAL_COLUMNS = ('#冰糖说英语#', '#冰糖点歌台#', '#冰糖小百科#', '#冰糖的树洞#',
                   '#冰糖么么答#', '#冰糖赠书#', '#冰糖书影音#', '#老照片#',
                   '#直播#', '#送福利#')


def transferContent(content):
    """Escape quotes so the text can be embedded in the SQL string."""
    string = ''
    for c in content:
        if c == '"':
            string += '\\"'
        elif c == "'":
            string += "\\'"
        else:
            string += c
    return string


def mysql(date, attitudes_count, comments_count, reposts_count, index, title, data):
    """Insert one post, routing special-column posts to their own table."""
    data = transferContent(data)
    title = transferContent(title)
    for column in SPECIAL_COLUMNS:
        if re.search(column, title):
            sql = '''INSERT INTO inital_weibo_data_special
                     VALUES ("%s", %d, %d, %d, %f, "%s", "%s")''' % (
                date, attitudes_count, comments_count, reposts_count,
                index, column, data)
            cursor.execute(sql)
            db.commit()
            return
    sql = '''INSERT INTO inital_weibo_data
             VALUES ("%s", %d, %d, %d, %f, "%s", "%s")''' % (
        date, attitudes_count, comments_count, reposts_count, index, title, data)
    cursor.execute(sql)
    db.commit()


def error_com(data):
    """Unused helper kept from an earlier attempt: first link text of a post."""
    return py(data).find('a').text()


def data_store():
    failed_pages = 0
    for i in range(1, 30):
        time.sleep(0.5)
        url = ('https://m.weibo.cn/api/container/getIndex?uid=2348668214'
               '&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%90%88%E8%82%A5'
               '%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6&featurecode=20000320'
               '&type=uid&value=2348668214&containerid=1076032348668214'
               '&page={num}').format(num=i)
        r_1 = requests.get(url, headers=headers)
        try:
            # Page 1 opens with profile/pinned cards, so posts start at card 2;
            # later pages carry posts from card 0 onwards.
            cards = range(2, 11) if i == 1 else range(0, 10)
            for j in cards:
                time.sleep(0.05)
                page_1 = r_1.json()['data']['cards'][j]['mblog']
                date = page_1['created_at']
                attitudes_count = page_1['attitudes_count']
                comments_count = page_1['comments_count']
                reposts_count = page_1['reposts_count']
                data = page_1['text']
                # Weighted engagement score: likes + 0.5*comments + 2*reposts.
                index = attitudes_count + comments_count * 0.5 + reposts_count * 2
                title = py(data).find('a').text()
                print(date, attitudes_count, comments_count, reposts_count,
                      index, title)
                try:
                    mysql(date, attitudes_count, comments_count, reposts_count,
                          index, title, data)
                except Exception:
                    # If the raw text still breaks the SQL, fall back to
                    # storing the extracted title as the post body.
                    mysql(date, attitudes_count, comments_count, reposts_count,
                          index, title, title)
        except Exception:
            # Malformed page (fewer cards than expected, etc.): count and skip.
            failed_pages += 1
            continue
    print(failed_pages)


def main():
    data_store()


if __name__ == '__main__':
    main()
    db.close()
```
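One thing I would change in hindsight: the hand-rolled quote escaping in transferContent plus %-formatted SQL is fragile (a post containing a stray backslash can still break the INSERT). pymysql can do the escaping itself through parameterized queries. A minimal sketch, reusing the cursor and db from the script above; mysql_safe is a hypothetical replacement for the mysql() function:

```python
# Sketch: same insert as mysql() above, but with driver-side escaping.
# The %s markers are pymysql placeholders, not Python string formatting,
# so transferContent() is no longer needed.
def mysql_safe(date, attitudes_count, comments_count,
               reposts_count, index, title, data):
    cursor.execute(
        'INSERT INTO inital_weibo_data VALUES (%s, %s, %s, %s, %s, %s, %s)',
        (date, attitudes_count, comments_count, reposts_count, index, title, data)
    )
    db.commit()
```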
The second module reads the week's rows back out of MySQL and writes them into an Excel workbook:

```python
import pymysql
import xlsxwriter

# All the per-week settings (roster, date range, file name) come from here.
from config_155 import *

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='hfut_weibo', use_unicode=True, charset='utf8')
cursor = db.cursor()
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')

workbook = xlsxwriter.Workbook(r'E:\合肥工业大学微博值班数据\%s.xlsx' % document_name)

worksheet1 = workbook.add_worksheet('日常值班')
for col, label in enumerate(('日期', '值班人', '转发数', '评论数', '点赞数', '指标数')):
    worksheet1.write(0, col, label)

worksheet2 = workbook.add_worksheet('特色栏目')
for col, label in enumerate(('栏目', '负责人', '转发数', '评论数', '点赞数', '指标数')):
    worksheet2.write(0, col, label)


def mysql_read(time, sig):
    """Build the SELECT for a duty date (sig == 1) or a special column (sig == 0)."""
    if sig == 1:
        sql = '''SELECT * FROM inital_weibo_data
                 WHERE date = '%s' ''' % time
    else:
        sql = '''SELECT * FROM inital_weibo_data_special
                 WHERE title = '%s' AND date BETWEEN '%s' AND '%s' ''' % (
            time, time_start, time_conclude)
    return sql


def excel_work(sql, i, time, name, worksheet):
    """Write one date's/column's rows starting at row i; return the next free row."""
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
        worksheet.write(i, 0, time)
        worksheet.write(i, 1, name)
        index_all = 0
        attitudes_count_all = 0
        comments_count_all = 0
        reposts_count_all = 0
        for row in results:
            date, attitudes_count, comments_count, reposts_count, index = row[:5]
            index_all += index
            attitudes_count_all += attitudes_count
            comments_count_all += comments_count
            reposts_count_all += reposts_count
            worksheet.write(i, 2, reposts_count)
            worksheet.write(i, 3, comments_count)
            worksheet.write(i, 4, attitudes_count)
            worksheet.write(i, 5, index)
            i = i + 1
            print(date, attitudes_count, comments_count, reposts_count, index)
        # Totals row for this group, then leave one blank row before the next.
        worksheet.write(i, 2, reposts_count_all)
        worksheet.write(i, 3, comments_count_all)
        worksheet.write(i, 4, attitudes_count_all)
        worksheet.write(i, 5, index_all)
        return i + 2
    except Exception:
        print('Error: unable to fetch data')
        return i  # keep the row counter valid so later groups still get written


def main():
    i = 1
    for time, name in work_1.items():
        sql = mysql_read(time, 1)
        i = excel_work(sql, i, time, name, worksheet1)
    i = 1
    for time, name in work_0.items():
        sql = mysql_read(time, 0)
        i = excel_work(sql, i, time, name, worksheet2)


if __name__ == '__main__':
    main()
    db.close()
    workbook.close()  # xlsxwriter only writes the file on close
```
And the config file, config_155.py, which is the only thing that has to change from week to week:

```python
# Weekday duty roster (kept for reference; the export script uses work_1).
work = {'星期一': '张壮', '星期二': '王修齐', '星期三': '徐露露', '星期四': '张雪辰',
        '星期五': '孙艳丽', '星期六': '马玉龙', '星期日': '陈之羽'}

# Special column -> person in charge.
work_0 = {'#冰糖说英语#': '彭二丹', '#冰糖点歌台#': '董令', '#冰糖小百科#': '周成旭',
          '#冰糖的树洞#': '张雪辰', '#冰糖么么答#': '', '#冰糖赠书#': '蒋媛媛',
          '#冰糖书影音#': '云曈', '#老照片#': '彭二丹', '#直播#': '蒋媛媛',
          '#送福利#': '彭二丹'}

# Date (as stored by the crawler) -> duty person for this week.
work_1 = {'01-08': '张壮', '01-09': '王修齐', '01-10': '徐露露', '01-11': '张雪辰',
          '01-12': '孙艳丽', '01-13': '马玉龙', '01-14': '陈之羽'}

document_name = '150周(第十九教学周)数据统计(2018.1.08-2018.1.14)'
time_start = '01-08'
time_conclude = '01-14'
```
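Since the duty dates always run Monday through Sunday, the work_1 table could also be generated instead of typed out each week. A small sketch; build_work_1 is a hypothetical helper, and the zip pairing assumes work lists the roster in Monday-to-Sunday order (which holds on Python 3.7+, where dicts keep insertion order):

```python
from datetime import date, timedelta

def build_work_1(monday, roster):
    # Map each date of the week (formatted like the crawler's stored dates,
    # e.g. '01-08') to the person on duty that day.
    days = [monday + timedelta(days=n) for n in range(7)]
    return {d.strftime('%m-%d'): name for d, name in zip(days, roster.values())}

# For the week of 2018-01-08 this reproduces the work_1 dict above:
# {'01-08': '张壮', '01-09': '王修齐', ..., '01-14': '陈之羽'}
print(build_work_1(date(2018, 1, 8), work))
```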