fix:twitter 过滤

c959a447 · liyang · 0832e447 · c959a447
Commit c959a447 authored Jul 28, 2023 by liyang
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 12 deletions

pc_twitter.py pc_twitter.py +14 -12

No files found.
--- a/pc_twitter.py
+++ b/pc_twitter.py
@@ -37,7 +37,6 @@ def reptile(browser=None, search_word=""):
    print(f"搜索词:{search_word}")
    base_url = "https://twitter.com/"
    browser = browser or create(no_headless=False, using_user_data=True)
-    # print(browser)
    # 打开网页
    browser.get(base_url)
    time.sleep(2)
@@ -66,7 +65,6 @@ def reptile(browser=None, search_word=""):
        button_login.click()
        time.sleep(2)
    except:
-        # print("------")
        error = ""

    url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
@@ -82,7 +80,6 @@ def reptile(browser=None, search_word=""):
                                                 f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
    length = len(element_authors_list)
    for index in range(length):
-        # print(index)
        soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
        # 查找time标签
        try:
@@ -105,14 +102,23 @@ def reptile(browser=None, search_word=""):
            div_elements = soup.find("div").findChildren("div", recursive=False)
            # div_tags = soup.find_all("div", recursive=False)
            for item in video_list:
+                div = soup.new_tag('div')
                img_tag = soup.new_tag('img')
                img_tag["src"] = item["poster"]
+                div.append(img_tag)
                for items in div_elements:
-                    if hasattr(items,"aria-labelledby"):
+                    attr = False
+                    try:
+                        attr = items["aria-labelledby"]
+                    except:
+                        attr = False
+                    if attr:
+                        # div["aria-labelledby"] = "sdfsf"
                        # div[@aria-labelledby="xx"] 替换为img标签【内容含有视频的替换为img标签】
-                        items.replaceWith(img_tag)
+                        items.replaceWith(div)
+                    else:
+                        error =""
        else:
-            # print("")
            error = ""

        image_list = soup.find_all("img")
@@ -136,7 +142,6 @@ def reptile(browser=None, search_word=""):
                        element['src'] = access_address
                        picture_url.append(download_dir)
        else:
-            # print("")
            error = ""

        # 删除多余div
@@ -148,7 +153,6 @@ def reptile(browser=None, search_word=""):
                item.extract()

        content = soup.prettify()
-        print("")
        # ---------------- 判断类型 start ----------
        # 类型
        content_type = ""
@@ -176,10 +180,9 @@ def reptile(browser=None, search_word=""):
        }
        # --------------- 组装数据 end---------------------
        data.append(obj)
+        soup = ""
+        time.sleep(0.1)

-    # 发送爬取数据到java服务
-    # print('----------------------')
-    # print(data)
    if len(data) > 0:
        # 保存json文件到本地
        # log.debug(os.path.abspath("../"))
@@ -212,7 +215,6 @@ def main():
    # 请求关键词
    response = getReptileTask()
    global status_task
-    # print(response)
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""