Commit c959a447 authored by liyang's avatar liyang

fix:twitter 过滤

parent 0832e447
...@@ -37,7 +37,6 @@ def reptile(browser=None, search_word=""): ...@@ -37,7 +37,6 @@ def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}") print(f"搜索词:{search_word}")
base_url = "https://twitter.com/" base_url = "https://twitter.com/"
browser = browser or create(no_headless=False, using_user_data=True) browser = browser or create(no_headless=False, using_user_data=True)
# print(browser)
# 打开网页 # 打开网页
browser.get(base_url) browser.get(base_url)
time.sleep(2) time.sleep(2)
...@@ -66,7 +65,6 @@ def reptile(browser=None, search_word=""): ...@@ -66,7 +65,6 @@ def reptile(browser=None, search_word=""):
button_login.click() button_login.click()
time.sleep(2) time.sleep(2)
except: except:
# print("------")
error = "" error = ""
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query' url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
...@@ -82,7 +80,6 @@ def reptile(browser=None, search_word=""): ...@@ -82,7 +80,6 @@ def reptile(browser=None, search_word=""):
f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']") f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
length = len(element_authors_list) length = len(element_authors_list)
for index in range(length): for index in range(length):
# print(index)
soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser") soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
# 查找time标签 # 查找time标签
try: try:
...@@ -105,14 +102,23 @@ def reptile(browser=None, search_word=""): ...@@ -105,14 +102,23 @@ def reptile(browser=None, search_word=""):
div_elements = soup.find("div").findChildren("div", recursive=False) div_elements = soup.find("div").findChildren("div", recursive=False)
# div_tags = soup.find_all("div", recursive=False) # div_tags = soup.find_all("div", recursive=False)
for item in video_list: for item in video_list:
div = soup.new_tag('div')
img_tag = soup.new_tag('img') img_tag = soup.new_tag('img')
img_tag["src"] = item["poster"] img_tag["src"] = item["poster"]
div.append(img_tag)
for items in div_elements: for items in div_elements:
if hasattr(items,"aria-labelledby"): attr = False
try:
attr = items["aria-labelledby"]
except:
attr = False
if attr:
# div["aria-labelledby"] = "sdfsf"
# div[@aria-labelledby="xx"] 替换为img标签【内容含有视频的替换为img标签】 # div[@aria-labelledby="xx"] 替换为img标签【内容含有视频的替换为img标签】
items.replaceWith(img_tag) items.replaceWith(div)
else:
error =""
else: else:
# print("")
error = "" error = ""
image_list = soup.find_all("img") image_list = soup.find_all("img")
...@@ -136,7 +142,6 @@ def reptile(browser=None, search_word=""): ...@@ -136,7 +142,6 @@ def reptile(browser=None, search_word=""):
element['src'] = access_address element['src'] = access_address
picture_url.append(download_dir) picture_url.append(download_dir)
else: else:
# print("")
error = "" error = ""
# 删除多余div # 删除多余div
...@@ -148,7 +153,6 @@ def reptile(browser=None, search_word=""): ...@@ -148,7 +153,6 @@ def reptile(browser=None, search_word=""):
item.extract() item.extract()
content = soup.prettify() content = soup.prettify()
print("")
# ---------------- 判断类型 start ---------- # ---------------- 判断类型 start ----------
# 类型 # 类型
content_type = "" content_type = ""
...@@ -176,10 +180,9 @@ def reptile(browser=None, search_word=""): ...@@ -176,10 +180,9 @@ def reptile(browser=None, search_word=""):
} }
# --------------- 组装数据 end--------------------- # --------------- 组装数据 end---------------------
data.append(obj) data.append(obj)
soup = ""
time.sleep(0.1)
# 发送爬取数据到java服务
# print('----------------------')
# print(data)
if len(data) > 0: if len(data) > 0:
# 保存json文件到本地 # 保存json文件到本地
# log.debug(os.path.abspath("../")) # log.debug(os.path.abspath("../"))
...@@ -212,7 +215,6 @@ def main(): ...@@ -212,7 +215,6 @@ def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
global status_task global status_task
# print(response)
if response['status_code'] == 200 and response['data']['code'] == 200: if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment