# -*- coding: utf-8 -*-
"""
Created on Mon Dec 13 22:24:58 2021

@author: 18742

Scrape Twitter search result pages with Selenium: scroll to the bottom of
the page to trigger lazy loading, parse the rendered HTML with lxml, and
export the post texts to CSV.
"""
from selenium import webdriver
import time
import pandas as pd
from lxml import html

# Idea: select the whole post container first, then use string(.) to merge its text.

# Earlier attempt: extract post text plus like/retweet/reply counts in one pass.
#def get_info2(wb):
#    wb.implicitly_wait(10)
#    selector = html.etree.HTML(wb.page_source)
#    data = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#    post_list = []
#    for i in range(len(data)):
#        # string(.) merges the text nodes; strip() removes surrounding blank lines
#        post = data[i].xpath('string(.)').strip()
#        print(post)
#        post_list.append(str(post))
#    like = wb.find_element_by_xpath("//*/div[@data-testid='like']//div/span/span").text
#    retweet = wb.find_element_by_xpath("//*/div[@data-testid='retweet']//div/span/span").text
#    reply = wb.find_element_by_xpath("//*/div[@data-testid='reply']//div/span/span").text
#    data = {
#        "post": post_list,
#        "like": like,
#        "retweet": retweet,
#        "reply": reply}
#    return data

# == Working snippet: drive Chrome directly via chromedriver =================
#url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
#wb = webdriver.Chrome()
#wb.get(url)
#time.sleep(3)
#selector = html.etree.HTML(wb.page_source)  # parse the rendered source into an lxml HTML tree
#like = selector.xpath("//div[@data-testid='like']//div/span/span/text()")
#print(like)
#data = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#post_list = []
#for i in range(len(data)):
#    post = data[i].xpath('string(.)').strip()  # merge the text nodes, strip whitespace
#    print(post)
#    post_list.append(str(post))
# =============================================================================


# Collect the text of every post on a search results page.
def get_posts(url):
    """
    url: a search page listing all the posts
    """
    wb = webdriver.Chrome()
    wb.get(url)
    time.sleep(3)

    # Handle lazy loading: scroll to the bottom and wait for new content.
    js = 'return document.body.scrollHeight'
    height = wb.execute_script(js)
    wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(5)

    t1 = int(time.time())
    status = True
    num = 0
    post_list = []

    while status:
        t2 = int(time.time())
        if t2 - t1 < 30:
            # Parse the rendered page source into an lxml HTML tree.
            selector = html.etree.HTML(wb.page_source)
            infos = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
            for info in infos:
                post = info.xpath("string(.)").strip()
                # The whole page is re-parsed on every pass, so skip posts
                # that were already collected.
                if post not in post_list:
                    post_list.append(post)
            new_height = wb.execute_script(js)
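            # A minimal self-contained illustration of the string(.) trick relied
            # on above (the markup here is invented for demonstration and is not
            # Twitter's real DOM): a post's text is split across nested spans, and
            # string(.) concatenates every text node under the selected element,
            # while a plain text() query misses the nested pieces.
            #
            #demo = html.fromstring("<div><span>nuclear </span><span>waste <b>water</b></span></div>")
            #print(demo.xpath("string(.)").strip())  # -> 'nuclear waste water'
            #print(demo.xpath(".//span/text()"))     # -> ['nuclear ', 'waste '] (drops 'water')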
            if new_height > height:
                time.sleep(1)
                wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                height = new_height
                t1 = int(time.time())
        elif num < 3:
            time.sleep(3)
            num = num + 1
        else:
            # Timed out even after retries: we have reached the bottom of the page.
            status = False

    return post_list

    # Parse the page HTML once at the end instead. This missed posts, probably
    # because the asynchronously loaded data is no longer all in page_source.
    #selector = html.etree.HTML(wb.page_source)
    #infos = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
    #post_list = []
    #for info in infos:
    #    post = info.xpath("string(.)").strip()
    #    post_list.append(post)
    #return post_list

    # Or scrape via Selenium WebElements instead of lxml.
    #infos = wb.find_elements_by_xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
    #post_list = []
    #for info in infos:
    #    post = info.text.strip()
    #    post_list.append(post)
    #return post_list


url = 'https://twitter.com/search?q=Beijing%20Winter%20Olympics%20Opening%20Ceremony&src=typed_query'
post_list = get_posts(url)
comm_df = pd.DataFrame(post_list)
print('here')
# utf_8_sig writes a BOM so that Excel recognises the UTF-8 encoding.
comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_Olympic_ceremony2.csv',
               encoding='utf_8_sig', index=False)


# Tweet text only: the other search topics, each exported to its own CSV.
#searches = {
#    'post_twitter_nuclear':       'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query',
#    'post_twitter_shenzhou':      'https://twitter.com/search?q=shenzhou-13&src=typed_query',
#    'post_twitter_chinalunar':    'https://twitter.com/search?q=China%20lunar%20soil&src=typed_query',
#    'post_twitter_nobel':         'https://twitter.com/search?q=Abdulrazak%20Gurnah%20Nobel%20Prize%20in%20Literature&src=typed_query',
#    'post_twitter_vietnam':       'https://twitter.com/search?q=Vietnam%20Factories%20&src=typed_query',
#    'post_twitter_chinavaccine':  'https://twitter.com/search?q=China%20provide%20vaccines&src=typed_query',
#    'post_twitter_brexiteconomy': 'https://twitter.com/search?q=Impact%20of%20Brexit%20on%20economy%20%27worse%20than%20Covid%27&src=typed_query',
#    'post_twitter_richhogging':   'https://twitter.com/search?q=rich%20countries%20hogging%20vaccines&src=typed_query',
#    'post_twitter_easetravelres': 'https://twitter.com/search?q=ease%20travel%20restrictions&src=typed_query',
#    'post_twitter_agreeontaxr':   'https://twitter.com/search?q=US%20reaches%20agreement%20to%20end%20European%20digital%20services%20taxes&src=typed_query',
#}
#for name, url in searches.items():
#    post_list = get_posts(url)
#    comm_df = pd.DataFrame(post_list)
#    comm_df.to_csv(rf'C:\Users\18742\Desktop\毕业论文\代码\{name}.csv',
#                   encoding='utf_8_sig', index=False)
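# get_posts() opens a new Chrome instance per call and never closes it, so the
# batch loop above would leave one browser window open per query. A minimal
# sketch of a wrapper with guaranteed cleanup (hypothetical helper, not used in
# the run above; for brevity it scrolls only once instead of looping to the
# bottom of the page):
def scrape_to_csv(url, csv_path):
    wb = webdriver.Chrome()
    try:
        wb.get(url)
        time.sleep(3)
        wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(5)
        selector = html.etree.HTML(wb.page_source)
        posts = [info.xpath('string(.)').strip()
                 for info in selector.xpath(
                     "//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")]
    finally:
        wb.quit()  # release the browser even if the page fails to load
    pd.DataFrame(posts).to_csv(csv_path, encoding='utf_8_sig', index=False)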
# Per-post engagement fields, for pairing counts with the post text:
#like = info.xpath("//div[@data-testid='like']//div/span/span/text()")
#retweet = info.xpath("//div[@data-testid='retweet']//div/span/span/text()")
#reply = info.xpath("//div[@data-testid='reply']//div/span/span/text()")
#data = {
#    "post": post,
#    "like": like,
#    "retweet": retweet,
#    "reply": reply}


# Earlier attempt: extract every post (text plus counts) from one page.
# `list` accumulates the results; `m` collects intermediate artifacts for
# debugging.
#def get_info(driver, url, list, m):
#    driver.implicitly_wait(10)
#    m.append(driver.page_source)
#    # Parse the rendered source into an lxml HTML tree; inspect the raw code
#    # first, then work out a usable XPath or regular expression.
#    selector = html.etree.HTML(driver.page_source)
#    m.append(selector)
#    # The page repeats this structure once per post: collect every occurrence,
#    # then iterate and pull the needed fields out of each one.
#    infos = selector.xpath("//div[@class='css-1dbjc4n']")
#    m.append(infos)
#    for info in infos:
#        post = []
#        data = info.xpath("//*/div[@class='css-1dbjc4n']/div/span/text()")
#        for i in range(len(data)):
#            post.append(str(data[i]))
#        like = info.xpath("//*/div[@data-testid='like']//span/span/text()")
#        retweet = info.xpath("//*/div[@data-testid='retweet']//span/span/text()")
#        reply = info.xpath("//*/div[@data-testid='reply']//span/span/text()")
#        data = {
#            "post": post,
#            "like": like,
#            "retweet": retweet,
#            "reply": reply
#        }
#        print(data)
#        list.append(data)
#    return list
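# The commented-out get_info() above runs absolute XPath queries ("//div[...]")
# inside the per-post loop, so every iteration matches against the whole page
# rather than the current post. A sketch of the per-post extraction with
# relative paths instead (".//" anchors the query at the current node); the
# class names and data-testid values are copied from the attempts above and
# may no longer match Twitter's DOM:
def extract_post_fields(info):
    """info: one lxml element wrapping a single post."""
    def first(xp):
        hits = info.xpath(xp)
        return hits[0].strip() if hits else ''  # '' when the counter is absent/zero
    return {
        'post':    info.xpath('string(.)').strip(),
        'like':    first(".//div[@data-testid='like']//span/span/text()"),
        'retweet': first(".//div[@data-testid='retweet']//span/span/text()"),
        'reply':   first(".//div[@data-testid='reply']//span/span/text()"),
    }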