Commit c959a447 authored by liyang

fix:twitter 过滤

parent 0832e447
......@@ -37,7 +37,6 @@ def reptile(browser=None, search_word=""):
print(f"搜索词:{search_word}")
base_url = "https://twitter.com/"
browser = browser or create(no_headless=False, using_user_data=True)
# print(browser)
# 打开网页
browser.get(base_url)
time.sleep(2)
......@@ -66,7 +65,6 @@ def reptile(browser=None, search_word=""):
button_login.click()
time.sleep(2)
except:
# print("------")
error = ""
url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
......@@ -82,7 +80,6 @@ def reptile(browser=None, search_word=""):
f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
length = len(element_authors_list)
for index in range(length):
# print(index)
soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
# 查找time标签
try:
......@@ -105,14 +102,23 @@ def reptile(browser=None, search_word=""):
div_elements = soup.find("div").findChildren("div", recursive=False)
# div_tags = soup.find_all("div", recursive=False)
for item in video_list:
div = soup.new_tag('div')
img_tag = soup.new_tag('img')
img_tag["src"] = item["poster"]
div.append(img_tag)
for items in div_elements:
if hasattr(items,"aria-labelledby"):
attr = False
try:
attr = items["aria-labelledby"]
except:
attr = False
if attr:
# div["aria-labelledby"] = "sdfsf"
# div[@aria-labelledby="xx"] 替换为img标签【内容含有视频的替换为img标签】
items.replaceWith(img_tag)
items.replaceWith(div)
else:
error =""
else:
# print("")
error = ""
image_list = soup.find_all("img")
......@@ -136,7 +142,6 @@ def reptile(browser=None, search_word=""):
element['src'] = access_address
picture_url.append(download_dir)
else:
# print("")
error = ""
# 删除多余div
......@@ -148,7 +153,6 @@ def reptile(browser=None, search_word=""):
item.extract()
content = soup.prettify()
print("")
# ---------------- 判断类型 start ----------
# 类型
content_type = ""
......@@ -176,10 +180,9 @@ def reptile(browser=None, search_word=""):
}
# --------------- 组装数据 end---------------------
data.append(obj)
soup = ""
time.sleep(0.1)
# 发送爬取数据到java服务
# print('----------------------')
# print(data)
if len(data) > 0:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
......@@ -212,7 +215,6 @@ def main():
# 请求关键词
response = getReptileTask()
global status_task
# print(response)
if response['status_code'] == 200 and response['data']['code'] == 200:
log.debug("call success")
search_word = ""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment