Commit d12c76af authored by liyang

feat:ins爬虫数据条件过滤优化

parent bda48e6b
...@@ -4,7 +4,7 @@ def get_log_path(): ...@@ -4,7 +4,7 @@ def get_log_path():
def get_base_url(): def get_base_url():
return "http://192.168.0.118:8081/" return "http://192.168.0.104:8081/"
def get_base_file_url(): def get_base_file_url():
......
...@@ -72,7 +72,8 @@ def reptile(browser=None, search_word=""): ...@@ -72,7 +72,8 @@ def reptile(browser=None, search_word=""):
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]"))) wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]")))
# 提取其他 # 提取其他
author = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a") author = browser.find_element("xpath",
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]/div/header/div[2]/div[1]/div[1]//a")
content_element = browser.find_element("xpath", content_element = browser.find_element("xpath",
"//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1") "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
...@@ -90,21 +91,35 @@ def reptile(browser=None, search_word=""): ...@@ -90,21 +91,35 @@ def reptile(browser=None, search_word=""):
# 过滤视频 # 过滤视频
video_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video") video_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video")
for key, item in enumerate(img_list): for key, item in enumerate(img_list):
src = ""
img_soup = ""
if len(video_list) == 0: if len(video_list) == 0:
if key == 0: if key == 0:
title = item.get_attribute("alt") title_str_list = item.get_attribute("alt").split("'")
# 下载图片至本地,替换标签中的src if len(title_str_list) >= 3:
id = str(int(time.time())) title = title_str_list[2]
else:
title = ""
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
src = item.get_attribute("src")
else:
# 有视频,图片链接从列表中提取
title = ""
a_soup = BeautifulSoup(element_link_list[index].get_attribute("outerHTML"), "html.parser")
# img_element = element_link_list[index].find_element("xpath","img")
img_soup = a_soup.find("img")
src = img_soup["src"]
str_list = link_str.split("/")
img_id = str_list[len(str_list) - 2]
# 下载地址 # 下载地址
download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}' download_dir = f'{os.path.join(file_dir, f"{img_id}.jpg")}'
# 访问地址 # 访问地址
access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg' access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{img_id}.jpg'
# 下载状态 # 下载状态
status = download_image(item.get_attribute("src"), download_dir) status = download_image(src, download_dir)
if status: if status:
# 将图片追加到内容中 # 将图片追加到内容中
img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser") img_soup["src"] = access_address
img_soup.img["src"] = access_address
# print(img_soup.prettify()) # print(img_soup.prettify())
soup.append(img_soup) soup.append(img_soup)
picture_url.append(access_address) picture_url.append(access_address)
...@@ -176,7 +191,7 @@ def main(): ...@@ -176,7 +191,7 @@ def main():
log.debug("call success") log.debug("call success")
search_word = "" search_word = ""
for item in response['data']['rows']: for item in response['data']['rows']:
if item['name'] == 'pms_instagram': if item['name'] == 'instagram':
search_word = item['keyword'] search_word = item['keyword']
table_name = item['tableName'] table_name = item['tableName']
status_task = int(item["status"]) status_task = int(item["status"])
......
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment