Commit ab634c7f authored by liyang

fix: scrape Twitter

parent ab99c057
@@ -47,7 +47,7 @@ def reptile(browser=None, search_word=""):
    for index_two in range(length_two):
        # Skip titles containing "公告" or "看板"
        if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
-            a=1
+            a = 1
        else:
            log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
            # Match with a regular expression
@@ -67,7 +67,8 @@ def reptile(browser=None, search_word=""):
                "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
        except:
-            log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+            log.error(
+                "xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
            log.debug(f'页面链接:{browser_current_url}')
            # Navigate the browser back to the previous page
            browser.back()
@@ -117,7 +118,7 @@ def reptile(browser=None, search_word=""):
                tag.decompose()
        except:
            # log.debug("查找所有的<a>标签失败")
-            a=1
+            a = 1
        try:
            # Find all first-level `div` elements
            div_elements = soup.find_all('div')
@@ -133,7 +134,7 @@ def reptile(browser=None, search_word=""):
        except:
            # log.debug("删除第一级div失败")
-            a=2
+            a = 2
        html = soup.prettify().replace('amp;', '')
        # ------------------ content filtering end --------------
@@ -149,7 +150,7 @@ def reptile(browser=None, search_word=""):
        }
        # --------------- data assembly end ---------------------
-        if search_word is None or search_word==str(search_word):
+        if search_word is None or search_word == str(search_word):
            data.append(obj)
        else:
            # Match with a regular expression
@@ -161,7 +162,7 @@ def reptile(browser=None, search_word=""):
                data.append(obj)
            else:
                # log.debug("未找到匹配的字符串")
-                a=3
+                a = 3
        # Navigate the browser back to the previous page
        browser.back()
        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
@@ -206,9 +207,14 @@ def reptile(browser=None, search_word=""):
    # time.sleep(3)
    browser.quit()

def main():
+    """
+    """
    # Request the keyword(s)
    response = getReptileTask()
+    global status_task
    # print(response)
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
@@ -217,8 +223,12 @@ def main():
            if item['name'] == 'ptt':
                search_word = item['keyword']
                table_name = item['tableName']
+                status_task = item["status"]
                # Convert Simplified to Traditional Chinese
-                reptile(None, convert_to_traditional(search_word))
+                if status_task == 0:
+                    reptile(None, convert_to_traditional(search_word))
+                else:
+                    log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        reptile(None, '')
@@ -228,5 +238,7 @@ def main():
# Global variables
data = []
table_name = "pms_ptt"
+# Whether the task is enabled
+status_task = '0'
# Call the main() function
main()
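The change above gates the PTT crawl on a new `status_task` flag taken from the task payload. Below is a minimal, self-contained sketch of that gating pattern; the task dict and the `reptile` / `convert_to_traditional` stubs are illustrative stand-ins, not the project's real helpers.

# Sketch of the status gating added in main(); stubs stand in for the real helpers.
def convert_to_traditional(text):
    return text  # stub: the real helper converts Simplified to Traditional Chinese

def reptile(browser=None, search_word=""):
    print(f"crawling for: {search_word}")  # stub for the real crawler

def run_task(item):
    # status == 0 means the task is enabled, mirroring the diff above
    if item.get("status") == 0:
        reptile(None, convert_to_traditional(item["keyword"]))
    else:
        print("crawl task not enabled")

run_task({"name": "ptt", "keyword": "测试", "tableName": "pms_ptt", "status": 1})

Note that the module-level default `status_task = '0'` is a string while the new check compares against the integer 0, so the crawl only runs when the API actually supplies a numeric status.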
@@ -22,85 +22,51 @@ def reptile(browser=None, search_word=""):
    # ['--headless']
    browser = browser or create()
    # print(browser)
-    # browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
-    # endDate = startDate = startDate + timedelta(days=i)
    # Open the page
    browser.get(url)
-    # WebDriverWait(browser,10).
-    # Open the login window
-    # open_button_login = WebDriverWait(browser, 10).until(
-    #     EC.presence_of_element_located((By.XPATH, "//a[@data-testid='login']")))
-    # open_button_login.click()
-    # time.sleep(5)
-    # Get the username/password input fields
-    # input_email_element = WebDriverWait(browser, 10).until(
-    #     EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
-    # # Get the "Next" button
-    # buttons = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@role='button']")))
-    # for item in buttons:
-    #     print(BeautifulSoup(item, 'html.parser'))
-    # soup = BeautifulSoup(page_content, 'html.parser')
-    # input_pwd_element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "//input[@name='pass']")))
-    # # Get the login button
-    # button_login = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@name='login']")))
-    #
-    # input_email_element.send_keys("liyang19970814@gmail.com")
-    # input_pwd_element.send_keys("xn89kiPT/^Kaeg#")
-    # button_login.click()
-    # print("---------------")
-    # print(input_email_element)
-    # print(input_pwd_element)
-    # print(button_login)
-    # logger.debug(button)
-    # Simulate clicking the button repeatedly to load more data
-    # while button.is_enabled():
-    #     time.sleep(2)  # wait a moment to make sure the page has finished loading
-    #     try:
-    #         button.click()
-    #         button = WebDriverWait(browser, 5).until(
-    #             EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='search-show-more-button']")))
-    #     except:
-    #         break
+    time.sleep(3)
+    try:
+        # Check whether login is required
+        login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
+        login_input.send_keys("liyang1851603")
+        # Get the "Next" button
+        buttons = browser.find_element('xpath', "//div[@role='button'][2]")
+        buttons.click()
+        time.sleep(3)
+        password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
+        password_input.send_keys("liyang19970814")
+        # Get the login button
+        button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
+        button_login.click()
+    except:
+        print("------")
+    # print(333333)
    # time.sleep(3)
-    # Get the full paginated data
-    # page_content = browser.page_source
-    # soup = BeautifulSoup(page_content, 'html.parser')
-    # print("----------")
-    # print(soup)
-    # list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
-    # for index, item in enumerate(list_news):
-    #     logger.debug(item)
-    #     # Grab the image
-    #     image_key = image_key + 1
-    #     url_element = item.find('img', {"class": "css-rq4mmj"})
-    #     image_url = url_element['src'] if url_element else ""
-    #     # logger.debug(url)
-    #     if image_url:
-    #         # logger.debug(url)
-    #         # # Download the image
-    #         #
-    #         filename = f"{image_key}.jpg"
-    #         # logger.debug(filename)
-    #         # sys.exit()
-    #         download_image(image_url, f'{fileDir}images/{filename}')
-    #     # Grab the text
-    #     title_element = item.find('h4', {"class": "css-2fgx4k"})
-    #     introduction_element = item.find('p', {"class": "css-16nhkrn"})
-    #     title = title_element.get_text() if title_element else ""
-    #     introduction = introduction_element.get_text() if introduction_element else ""
-    #     news = {
-    #         "title": title,
-    #         "introduction": introduction,
-    #         "imageName": filename
-    #     }
-    #     data.append(news)
-    #     logger.debug(data)
-    # Save the data to a file
-    # with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
-    #     json.dump(data, file, indent=2, ensure_ascii=False)
+    # Authors
+    element_authors_list = browser.find_elements('xpath',
+        "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//a[@role='link']//div[@dir='ltr']")
+    print(element_authors_list)
+    print("2222")
+    # Publication time
+    element_release_list = browser.find_elements('xpath',
+        "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//div[2]//time[@datetime]")
+    # Title
+    # element_title_list = browser.find_element('xpath',)
+    # Content
+    element_content_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
+    # print(element_content_list)
+    length = len(element_authors_list)
+    print(length)
+    for index in range(length):
+        author = element_authors_list[index].text
+        release_time = element_release_list[index].get_attribute("datetime")
+        content = element_content_list[index]
+        print(content)
+        # Content filtering
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(content.get_attribute("innerHTML"), 'html.parser')
+        print(soup)
+        print("-----")
    # browser.close()
    # # Close the browser driver
    # browser.quit()
...
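The new login flow above relies on fixed time.sleep() calls and bare find_element lookups. An equivalent variant using Selenium explicit waits is sketched below under the assumption that the same XPaths apply; the credentials are placeholders and `browser` is an already-created WebDriver, not the project's `create()` helper.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login_twitter(browser, username, password, timeout=10):
    # Wait for each element instead of sleeping a fixed number of seconds
    wait = WebDriverWait(browser, timeout)
    login_input = wait.until(EC.presence_of_element_located(
        (By.XPATH, "//input[@autocomplete='username']")))
    login_input.send_keys(username)
    # "Next" button, same XPath as in the diff above
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@role='button'][2]"))).click()
    password_input = wait.until(EC.presence_of_element_located(
        (By.XPATH, "//input[@autocomplete='current-password']")))
    password_input.send_keys(password)
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@data-testid='LoginForm_Login_Button']"))).click()

Wrapping the whole login in try/except, as the diff does, lets the crawler continue when the session is already authenticated and the username field never appears.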
@@ -125,6 +125,7 @@ def main():
    # Request the keyword(s)
    response = getReptileTask()
    # print(response)
+    global status_task
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
@@ -132,7 +133,11 @@ def main():
            if item['name'] == 'youtube':
                search_word = item['keyword']
                table_name = item['tableName']
-                reptile(None, convert_to_traditional(search_word))
+                status_task = item["status"]
+                if status_task == 0:
+                    reptile(None, convert_to_traditional(search_word))
+                else:
+                    log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        reptile(None, '')
@@ -142,5 +147,7 @@ def main():
# Global variables
data = []
table_name = "pms_youtube"
+# Whether the task is enabled
+status_task = '0'
# Call the main() function
main()
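Both main() functions pass the keyword through convert_to_traditional() before crawling. That helper is not part of this commit; if it wraps OpenCC (an assumption, shown here only as a hypothetical sketch), it could look like this:

# Hypothetical convert_to_traditional(); assumes the OpenCC package is installed
# (e.g. pip install opencc-python-reimplemented).
from opencc import OpenCC

_s2t = OpenCC('s2t')  # Simplified-to-Traditional converter

def convert_to_traditional(text):
    return _s2t.convert(text or "")

# Example: convert_to_traditional("体育") returns "體育"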