Commit d6d072eb authored by liyang

fix:爬取数据入库

parent 3769935a
2023-07-11 11:19:10,260 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:19:23,852 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689003950.A.80A.html
2023-07-11 11:19:25,571 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689005722.A.1F9.html
2023-07-11 11:19:27,176 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689006932.A.34C.html
2023-07-11 11:19:27,254 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:28,992 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689010356.A.067.html
2023-07-11 11:19:29,071 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:30,499 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689011153.A.087.html
2023-07-11 11:19:30,581 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:31,947 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689031313.A.76C.html
2023-07-11 11:19:33,412 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689032073.A.5BE.html
2023-07-11 11:19:36,361 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:19:38,292 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:39,809 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689038260.A.FDA.html
2023-07-11 11:19:40,723 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:42,516 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689039142.A.7D6.html
2023-07-11 11:19:42,645 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:44,100 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689039982.A.BDE.html
2023-07-11 11:19:44,235 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:46,582 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689040650.A.0C7.html
2023-07-11 11:19:47,199 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:48,810 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689041464.A.C60.html
2023-07-11 11:19:49,855 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:51,324 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689044399.A.8CD.html
2023-07-11 11:19:51,417 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:52,883 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1667716869.A.C66.html
2023-07-11 11:19:53,055 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:55,243 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:19:57,300 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:19:58,695 DEBUG pc_ptt.py : reptile [line: 145] C:\Users\desktop\Desktop\network-assets-admin
2023-07-11 11:19:58,696 DEBUG filse.py : save_json [line: 8] ------save json start--------
2023-07-11 11:19:58,696 DEBUG filse.py : save_json [line: 14] 文件保存路径:C:\Users\desktop\Desktop\network-assets-admin\network-assets-reptile\reptile_data\ptt\1689045598.json
2023-07-11 11:19:58,701 DEBUG pc_ptt.py : reptile [line: 149] -----------------------------
2023-07-11 11:19:58,702 DEBUG pc_ptt.py : reptile [line: 152] save file success
2023-07-11 11:19:58,703 DEBUG pc_ptt.py : reptile [line: 154] file_path:C:\Users\desktop\Desktop\network-assets-admin\network-assets-reptile\reptile_data\ptt\1689045598.json
2023-07-11 11:37:19,424 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:37:36,333 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689003950.A.80A.html
2023-07-11 11:37:38,261 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689005722.A.1F9.html
2023-07-11 11:37:39,906 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689006932.A.34C.html
2023-07-11 11:37:40,071 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:37:41,530 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689010356.A.067.html
2023-07-11 11:37:41,696 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:37:44,449 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689011153.A.087.html
2023-07-11 11:37:44,588 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:37:47,501 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689031313.A.76C.html
2023-07-11 11:37:49,962 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689032073.A.5BE.html
2023-07-11 11:37:58,322 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:37:58,998 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:00,471 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689038260.A.FDA.html
2023-07-11 11:38:00,587 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:01,902 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689039142.A.7D6.html
2023-07-11 11:38:01,991 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:03,435 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689039982.A.BDE.html
2023-07-11 11:38:03,520 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:04,990 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689040650.A.0C7.html
2023-07-11 11:38:05,080 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:07,660 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689041464.A.C60.html
2023-07-11 11:38:08,151 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:09,655 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689044399.A.8CD.html
2023-07-11 11:38:09,755 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:11,149 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689045896.A.F9F.html
2023-07-11 11:38:11,225 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:13,823 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689045957.A.332.html
2023-07-11 11:38:13,925 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:15,404 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1667716869.A.C66.html
2023-07-11 11:38:15,528 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:18,298 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:38:19,266 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:38:20,709 DEBUG pc_ptt.py : reptile [line: 145] C:\Users\desktop\Desktop\network-assets-admin
2023-07-11 11:38:20,710 DEBUG filse.py : save_json [line: 8] ------save json start--------
2023-07-11 11:38:20,711 DEBUG filse.py : save_json [line: 14] 文件保存路径:C:\Users\desktop\Desktop\network-assets-admin\network-assets-reptile\reptile_data\ptt\1689046700.json
2023-07-11 11:38:20,714 DEBUG pc_ptt.py : reptile [line: 149] -----------------------------
2023-07-11 11:38:39,203 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:38:20,715 DEBUG pc_ptt.py : reptile [line: 152] save file success
2023-07-11 11:39:21,884 DEBUG pc_ptt.py : reptile [line: 154] file_path:C:\Users\desktop\Desktop\network-assets-admin\network-assets-reptile\reptile_data\ptt\1689046700.json
2023-07-11 11:40:09,711 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:40:17,779 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689003950.A.80A.html
2023-07-11 11:40:19,683 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689005722.A.1F9.html
2023-07-11 11:40:21,161 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689006932.A.34C.html
2023-07-11 11:40:21,270 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:22,872 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689010356.A.067.html
2023-07-11 11:40:23,151 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:24,955 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689011153.A.087.html
2023-07-11 11:40:26,122 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:27,609 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689031313.A.76C.html
2023-07-11 11:40:29,099 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689032073.A.5BE.html
2023-07-11 11:40:32,566 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:40:35,102 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:36,575 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689038260.A.FDA.html
2023-07-11 11:40:36,696 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:38,127 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689039142.A.7D6.html
2023-07-11 11:40:38,232 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:39,654 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689039982.A.BDE.html
2023-07-11 11:40:39,744 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:41,436 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689040650.A.0C7.html
2023-07-11 11:40:41,691 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:43,288 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689041464.A.C60.html
2023-07-11 11:40:43,645 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:45,553 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689044399.A.8CD.html
2023-07-11 11:40:46,378 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:47,919 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689045896.A.F9F.html
2023-07-11 11:40:47,993 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:49,769 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689045957.A.332.html
2023-07-11 11:40:50,295 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:51,784 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1667716869.A.C66.html
2023-07-11 11:40:51,973 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:54,220 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:40:55,751 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:57,133 DEBUG pc_ptt.py : reptile [line: 145] C:\Users\desktop\Desktop\network-assets-admin
2023-07-11 11:40:57,133 DEBUG filse.py : save_json [line: 8] ------save json start--------
2023-07-11 11:40:57,134 DEBUG filse.py : save_json [line: 14] 文件保存路径:C:\Users\desktop\Desktop\network-assets-admin\network-assets-reptile\reptile_data\ptt\1689046857.json
2023-07-11 11:40:57,138 DEBUG pc_ptt.py : reptile [line: 149] -----------------------------
2023-07-11 11:46:10,964 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:48:44,147 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:49:29,252 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-11 11:49:37,790 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689003950.A.80A.html
2023-07-11 11:49:40,005 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689005722.A.1F9.html
2023-07-11 11:49:42,063 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689006932.A.34C.html
2023-07-11 11:49:43,003 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:49:44,513 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689010356.A.067.html
2023-07-11 11:49:44,624 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:49:46,074 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689011153.A.087.html
2023-07-11 11:49:46,196 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:49:47,659 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689031313.A.76C.html
2023-07-11 11:49:49,136 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689032073.A.5BE.html
2023-07-11 11:49:51,904 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 11:49:53,586 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 11:40:57,139 DEBUG pc_ptt.py : reptile [line: 152] save file success
2023-07-11 12:37:47,838 DEBUG pc_ptt.py : reptile [line: 154] file_path:C:\Users\desktop\Desktop\network-assets-admin\network-assets-reptile\reptile_data\ptt\1689046857.json
2023-07-11 12:40:39,165 DEBUG pc_ptt.py : main [line: 192] call success
2023-07-11 12:40:50,555 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689003950.A.80A.html
2023-07-11 12:40:52,547 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689005722.A.1F9.html
2023-07-11 12:40:54,724 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689006932.A.34C.html
2023-07-11 12:40:54,816 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 12:40:56,600 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689010356.A.067.html
2023-07-11 12:40:56,679 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 12:40:58,154 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689011153.A.087.html
2023-07-11 12:40:58,237 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
2023-07-11 12:40:59,577 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689031313.A.76C.html
2023-07-11 12:41:01,097 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689032073.A.5BE.html
2023-07-11 12:41:03,688 DEBUG pc_ptt.py : reptile [line: 90] 网页链接https://www.ptt.cc/bbs/Stock/M.1689035402.A.885.html
2023-07-11 12:41:04,316 DEBUG pc_ptt.py : reptile [line: 130] 未找到匹配的字符串
...@@ -61,17 +61,15 @@ def write_to_database(data): ...@@ -61,17 +61,15 @@ def write_to_database(data):
def reptile(browser=None, search_word=""): def reptile(browser=None, search_word=""):
url = "https://www.ptt.cc/bbs/hotboards.html" url = "https://www.ptt.cc/bbs/hotboards.html"
browser = browser or create(['--headless']) browser = browser or create(['--headless'])
# browser = browser or create()
# time.sleep(1)
# 打开网页 # 打开网页
browser.get(url) browser.get(url)
log.debug("已打开浏览器")
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']") classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# log.debug(classify_item_list) # log.debug(classify_item_list)
length = len(classify_item_list) length = len(classify_item_list)
for index in range(length): for index in range(length):
if 1 < index < 3: if 0 < index < 2:
classify_item_list[index].click() classify_item_list[index].click()
# if index==0:
time.sleep(1) time.sleep(1)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a") element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
length_two = len(element_list) length_two = len(element_list)
...@@ -104,35 +102,61 @@ def reptile(browser=None, search_word=""): ...@@ -104,35 +102,61 @@ def reptile(browser=None, search_word=""):
# ------------------------------------ # ------------------------------------
# 使用BeautifulSoup解析HTML # 使用BeautifulSoup解析HTML
soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser') soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
# 作者
element_author = browser.find_element('xpath',
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
log.debug('开始判断类型')
# ---------------- 判断类型 start ----------
# 查找所有img标签
img_tags = soup.find_all('img')
# 类型
content_type = ""
if len(img_tags) > 0:
content_type = "图文"
else:
content_type = "文字"
# ---------------- 判断类型 end ----------
log.debug('开始内容过滤')
# ------------------ content 过滤 start--------------
# 查找所有的<a>标签 # 查找所有的<a>标签
a_tags = soup.find_all('a', href=True) a_tags = soup.find_all('a', href=True)
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签 # 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for tag in a_tags: for tag in a_tags:
tag.decompose() tag.decompose()
# 找到所有第一级标签为 `div` 的元素
div_elements = soup.find_all('div')
# 逐个删除这些元素
for div in div_elements:
div.extract()
# 删除第一级span
span_element = soup.find_all('span')
for span in span_element:
span.extract()
html = soup.prettify().replace('amp;', '') html = soup.prettify().replace('amp;', '')
# log.debug(html) print(html)
# log.debug('11111') print("aaaaa")
# ------------------------------------ # ------------------ content 过滤 end--------------
# 组装数据
# --------------- 组装数据 start---------------------
obj = { obj = {
"title": element_title.text, "title": element_title.text,
"content": html, "content": html,
"link": browser_current_url, "link": browser_current_url,
"reptileTime": str(int(time.time())) "reptileTime": str(int(time.time())),
"type": content_type,
"author": element_author.text
} }
# --------------- 组装数据 end---------------------
# ------------------------------------------------------ # data.append(obj)
data.append(obj) # 使用正则表达式进行匹配
# # 使用正则表达式进行匹配 matches = re.findall(search_word, element_title.text)
# matches = re.findall(search_word, element_title.text) # 打印匹配结果
# # 打印匹配结果 if matches:
# if matches: # log.debug(f"找到了匹配的字符串:{matches}")
# # log.debug(f"找到了匹配的字符串:{matches}") data.append(obj)
# data.append(obj) else:
# else: log.debug("未找到匹配的字符串")
# log.debug("未找到匹配的字符串")
# ------------------------------------------------------
# 浏览器返回上一页 # 浏览器返回上一页
browser.back() browser.back()
...@@ -185,11 +209,6 @@ def convert_to_traditional(simplified_text): ...@@ -185,11 +209,6 @@ def convert_to_traditional(simplified_text):
return traditional_text return traditional_text
# 全局变量
data = []
table_name = "pms_ptt"
def main(): def main():
# 请求关键词 # 请求关键词
response = getReptileTask() response = getReptileTask()
...@@ -208,4 +227,8 @@ def main(): ...@@ -208,4 +227,8 @@ def main():
# upload_control() # upload_control()
# 全局变量
data = []
table_name = "pms_ptt"
# 调用main函数
main() main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment