時間:2023-05-14 22:27:01 | 來源:網站運營
時間:2023-05-14 22:27:01 來源:網站運營
用python爬取微博評論 ▏附源碼:"""構造GET請求參數""" data = { 'id': weibo_id, 'mid': weibo_id, 'max_id': max_id, 'max_id_type': max_id_type }
"""構造GET請求參數""" data = { 'id': weibo_id, 'mid': weibo_id, 'max_id': max_id, 'max_id_type': max_id_type }
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Small Tkinter tool that fetches the comments of a Weibo post and prints them.

The user pastes the post URL and a logged-in cookie into the window; the
comments are then requested page by page from the mobile "hotflow" endpoint
and written to standard output.
"""
from datetime import datetime
from requests_html import HTMLSession
import re
import time
import tkinter as tk
import urllib3

# Every request below is sent with verify=False, so silence urllib3's warnings.
urllib3.disable_warnings()
session = HTMLSession()


class WBSpider(object):
    """Tkinter front end plus the request/parse logic for Weibo comments."""

    def __init__(self):
        """Build the 800x600 window with its labels, entries, text box and buttons."""
        self.window = tk.Tk()
        self.window.title('微博評論信息采集')
        self.window.geometry('800x600')

        # Label and entry for the URL of the post whose comments are wanted.
        self.label_user = tk.Label(self.window, text='請輸入要爬取的微博評論的地址:',
                                   font=('Arial', 12), width=30, height=2)
        self.label_user.pack()
        self.entry_user = tk.Entry(self.window, show=None, font=('Arial', 14))
        self.entry_user.pack(after=self.label_user)

        # Label and entry for the cookie of a logged-in session.
        self.label_passwd = tk.Label(self.window, text="請輸入登陸后的cookie:",
                                     font=('Arial', 12), width=30, height=2)
        self.label_passwd.pack()
        self.entry_passwd = tk.Entry(self.window, show=None, font=('Arial', 14))
        self.entry_passwd.pack(after=self.label_passwd)

        # Text box; the "clear" button empties it.
        self.text1 = tk.Text(self.window, font=('Arial', 12), width=85, height=22)
        self.text1.pack()

        # "Crawl" button, packed before the text box.
        self.button_1 = tk.Button(self.window, text='爬取', font=('Arial', 12),
                                  width=10, height=1, command=self.parse_hit_click_1)
        self.button_1.pack(before=self.text1)

        # "Clear" button, anchored to the east side.
        self.button_2 = tk.Button(self.window, text='清除', font=('Arial', 12),
                                  width=10, height=1, command=self.parse_hit_click_2)
        self.button_2.pack(anchor="e")

    def parse_hit_click_1(self):
        """Handle the crawl button: read both entries and start ``main``."""
        user_url = self.entry_user.get()
        pass_wd = self.entry_passwd.get()
        self.main(user_url, pass_wd)

    def main(self, user_url, pass_wd):
        """Resolve the post id, print the first comment page, then paginate.

        :param user_url: post URL; it must contain a ``#`` and at least three
            ``/``-separated parts after the first slash, otherwise the
            extraction below raises ``IndexError``.
        :param pass_wd: cookie string sent with every request.
        """
        i = 1
        headers_1 = {
            'cookie': pass_wd,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }
        headers_2 = {
            # NOTE(review): the original referer value was lost when this file
            # was extracted (it had become a non-Latin-1 page title, which
            # cannot be sent as an HTTP header). The mobile site root is used
            # here as a best guess -- confirm.
            "referer": "https://m.weibo.cn/",
            'cookie': pass_wd,
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36'
        }

        # Take the text between the first '/' and the '#', then keep whatever
        # follows its third '/'.
        uid_1 = re.findall('/(.*?)#', user_url)[0]
        uid_2 = uid_1.split('/', 3)[3]

        url_1 = f'https://weibo.com/ajax/statuses/show?id={uid_2}'
        prox = ''
        response = session.get(url_1, proxies={'http': prox, 'https': prox},
                               headers=headers_1, verify=False).content.decode()
        # The id is the value sitting between '"id":' and ',"idstr"'.
        weibo_id = re.findall('"id":(.*?),"idstr"', response)[0]

        # First comment page.
        start_url = f'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id_type=0'
        response = session.get(start_url, proxies={'http': prox, 'https': prox},
                               headers=headers_2, verify=False).json()

        # Paging cursor for the next request.
        max_id = response['data']['max_id']
        max_id_type = response['data']['max_id_type']
        data = {
            'id': weibo_id,
            'mid': weibo_id,
            'max_id': max_id,
            'max_id_type': max_id_type
        }

        self.parse_response_data(response, i)
        i += 1
        self.parse_page_func(data, weibo_id, headers_2, i)

    def parse_page_func(self, data, weibo_id, headers_2, i):
        """Fetch and print the remaining comment pages.

        Implemented as a loop rather than the original unconditional
        self-recursion, which could only end in an exception.

        :param data: query parameters for the next page (``id``, ``mid``,
            ``max_id``, ``max_id_type``).
        :param weibo_id: id of the post, reused for every page.
        :param headers_2: request headers for the mobile endpoint.
        :param i: number of the page about to be fetched.
        """
        # NOTE(review): the URL literal was destroyed in the extracted source;
        # this is the same endpoint ``main`` uses for the first page.
        start_url = 'https://m.weibo.cn/comments/hotflow'
        prox = ''
        # NOTE(review): a falsy max_id is treated as "no further page" --
        # presumably the API reports 0 on the last page; confirm.
        while data.get('max_id'):
            response = session.get(start_url, proxies={'http': prox, 'https': prox},
                                   headers=headers_2, params=data, verify=False).json()
            page = response.get('data')
            if not page:
                # No payload in the reply: stop instead of raising KeyError.
                break

            self.parse_response_data(response, i)
            i += 1
            data = {
                'id': weibo_id,
                'mid': weibo_id,
                'max_id': page['max_id'],
                'max_id_type': page['max_id_type']
            }

    def parse_response_data(self, response, i):
        """Print every comment found in one page of the response.

        :param response: decoded JSON of one page; the comment list is read
            from ``response['data']['data']``.
        :param i: page number, used only in the closing message.
        """
        data_list = response['data']['data']
        for data_json_dict in data_list:
            try:
                texts_1 = data_json_dict['text']
                # Replace each <span>...</span> with the concatenated alt
                # values found in the comment. A callable is used so the alt
                # text is inserted literally, not read as a regex template.
                alts = ''.join(re.findall(r'alt=(.*?) ', texts_1))
                texts = re.sub("<span.*?</span>", lambda _match: alts, texts_1)
                # Like count.
                like_counts = str(data_json_dict['like_count'])
                # Creation time, parsed together with its UTC offset.
                created_at = data_json_dict['created_at']
                std_transfer = '%a %b %d %H:%M:%S %z %Y'
                std_create_times = str(datetime.strptime(created_at, std_transfer))
                # Gender: 'f' is shown as female, anything else as male.
                gender = data_json_dict['user']['gender']
                genders = '女' if gender == 'f' else '男'
                # Screen name of the commenter.
                screen_names = data_json_dict['user']['screen_name']
            except (KeyError, TypeError, ValueError):
                # Best effort: skip a comment that is missing a field or has
                # an unparsable value rather than aborting the whole page.
                continue
            print(screen_names, genders, std_create_times, texts, like_counts)
            print()
        print('*******************************************************************************************')
        print()
        print(f'*****第{i}頁評論打印完成*****')

    def parse_hit_click_2(self):
        """Handle the clear button: empty both entries and the text box."""
        self.entry_user.delete(0, "end")
        self.entry_passwd.delete(0, "end")
        self.text1.delete("1.0", "end")

    def center(self):
        """Place the 800x600 window in the middle of the screen."""
        ws = self.window.winfo_screenwidth()
        hs = self.window.winfo_screenheight()
        x = int((ws / 2) - (800 / 2))
        y = int((hs / 2) - (600 / 2))
        self.window.geometry('{}x{}+{}+{}'.format(800, 600, x, y))

    def run_loop(self):
        """Lock the window size, centre it and enter the Tk main loop."""
        self.window.resizable(False, False)
        self.center()
        self.window.mainloop()


if __name__ == '__main__':
    w = WBSpider()
    w.run_loop()
來源:CSDN博主「主打Python」原文鏈接:
關鍵詞:評論
客戶&案例
營銷資訊
關于我們
客戶&案例
營銷資訊
關于我們
微信公眾號
版權所有 © 億企邦 1997-2025 保留一切法律許可權利。