liyang / network-assets-reptile / Commits / dae3fe3d

Commit dae3fe3d authored Aug 09, 2023 by liyang
feat:ptt debug
parent 27018e2d
Showing 2 changed files with 185 additions and 149 deletions

log/error.log    +9    -0
pc_ptt.py        +176  -149
log/error.log

@@ -11,3 +11,12 @@
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:32,527 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:41,957 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:43,433 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:51:10,728 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:52:41,156 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:58:54,782 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:04,220 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:27,844 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 16:00:02,916 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
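The nine new entries all report the same missing XPath (//div[@id='main-content']/div[3]//span[@class='article-meta-value'], the post's meta values) from reptile() in pc_ptt.py; "xpath 找不到元素" means "xpath: element not found". For context, a minimal sketch of the kind of guarded lookup that emits such a line; the helper name is illustrative, and `log` (the project's utils.Logger logger) is assumed to expose an error() method:

    # Sketch: guarded element lookup that logs the XPath when it is missing (assumed helper).
    from selenium.common.exceptions import NoSuchElementException

    def find_or_log(browser, xpath):
        try:
            return browser.find_element('xpath', xpath)
        except NoSuchElementException:
            log.error(f"xpath 找不到元素:{xpath}")  # "element not found for xpath"
            return None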
pc_ptt.py
@@ -8,6 +8,9 @@ import loguru
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.common import NoSuchElementException
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""):
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
for index, item_element in enumerate(classify_item_list):
    # For now, only crawl the 2nd category
    if 0 <= index <= 14:
    # Only crawl the 綜合 (general) category
    if 0 <= index < 1:
        type_title = classify_item_list[index].text
        # Enter the category page
        classify_item_list[index].click()
@@ -59,40 +62,46 @@ def reptile(browser=None, search_word=""):
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = 1
# element_meta_list = browser.find_elements("xpath", "//div[@class='r-ent']//div[@class='meta']")

def process_data():
    # Add search
    search_input = browser.find_element("xpath", "//div[@class='search-bar']//input")
    if search_word != search_input.get_attribute("value"):
        # Type in the search keyword
        search_input.send_keys(search_word)
        # Click the input box
        # browser.execute_script("arguments[0].click();", search_input)
        # Confirm the search keyword
        # search_input.click()
        search_input.submit()
        # Sleep for 300 ms to let the page finish loading
        time.sleep(0.3)
    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
    nonlocal page
    print(f"当前为第:{page} 页,共 {len(element_list)} 条数据")
    for index_two, item in enumerate(element_list):
        # print(element_list[index_two].text)
        try:
            re.findall("公告", item.text)
        except IndexError:
            log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
            print("当前连接:" + str(browser.current_url))
            print(data[len(data) - 1]["title"])
        # Match the keyword with a regular expression
        if re.findall(search_word, item.text):
            # log.debug(f"找到了匹配的字符串:{matches}")
            error = ""
        else:
            # The post title in this iteration does not match the keyword; skip to the next iteration
            continue
        # log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
        # print("当前连接:" + str(browser.current_url))
        # print(data[len(data) - 1]["title"])
        # The title must not contain "公告" (announcement) or "看板" (board)
        if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
            if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
                a = 1
        else:
            # Match with a regular expression
            # matches =
            # log.debug(element_list[index_two].text+str(matches))
            # Print the match result
            # if matches:
            #     log.debug(f"找到了匹配的字符串:{matches}")
            element_list[index_two].click()
            time.sleep(0.1)
            time.sleep(0.2)
            # Original link
            browser_current_url = browser.current_url
            # print(browser_current_url)
            # log.debug('网页链接' + str(browser_current_url))
            try:
                # Fetch the post details
                element_title = browser.find_element('xpath', ...
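Distilling the branching above: a post is kept only when its title matches search_word and skipped when it looks like an announcement or board entry. A compact restatement of that rule; the function name is illustrative and the check order is simplified:

    # Sketch: the title filter applied in this hunk, restated as one predicate.
    import re

    def should_crawl(title, search_word):
        # Skip announcement ("公告") and board ("看板") entries.
        if re.findall("公告", title) or re.findall("看板", title):
            return False
        # Keep only titles matching the keyword (treated as a regex, as in the original).
        return bool(re.findall(search_word, title))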
@@ -104,7 +113,8 @@ def reptile(browser=None, search_word=""):
log.debug(f'页面链接:{browser_current_url}')
# Browser goes back to the previous page
browser.back()
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
break
# The content may contain images and videos and needs post-processing
element_content = browser.find_element('xpath', "//div[@id='main-content']")
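This hunk re-queries element_list right after browser.back(); a likely reason is that elements located before navigation go stale. A minimal illustration reusing the names from the hunk (defensive form, not the committed code):

    # Sketch: pre-navigation WebElements raise StaleElementReferenceException when touched,
    # so the list is fetched again after browser.back().
    from selenium.common.exceptions import StaleElementReferenceException

    try:
        _ = element_list[index_two].text  # touching a stale element raises
    except StaleElementReferenceException:
        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")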
@@ -183,6 +193,12 @@ def reptile(browser=None, search_word=""):
# ------------------ content filtering end --------------
date_string = element_release.text
# date_string = "Wed Aug 9 15:39:26 2023 //update 20934353"
# Extract the date string
if "//" in date_string:
    date_string = date_string.split("//")[0]
    date_string = date_string.strip()
date_format = "%a %b %d %H:%M:%S %Y"
# Convert the date string to a datetime object
date_time = datetime.strptime(date_string, date_format)
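The commented sample above ("Wed Aug 9 15:39:26 2023 //update 20934353") shows why the string is split on "//" before parsing. A quick, standalone check of the format string against that sample:

    # Sketch: parsing the sample timestamp from the comment above.
    from datetime import datetime

    raw = "Wed Aug 9 15:39:26 2023 //update 20934353"
    date_part = raw.split("//")[0].strip() if "//" in raw else raw.strip()
    print(datetime.strptime(date_part, "%a %b %d %H:%M:%S %Y"))  # -> 2023-08-09 15:39:26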
@@ -206,17 +222,28 @@ def reptile(browser=None, search_word=""):
data.append(obj)
# Browser goes back to the previous page
browser.back()
time.sleep(0.1)
time.sleep(0.2)
element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
page = page + 1
# print("111111")
try:
    prev_button = browser.find_element('xpath', "//a[@class='btn wide' and text() = '‹ 上頁']")
    prev_button.click()
    time.sleep(0.3)
    process_data()
except:
    error = ""
# print("循环结束")
# Browser goes back to the previous page
browser.back()
if index == 0:
    browser.back()
    time.sleep(0.1)
# Fetch again
classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
process_data()
# # print("循环结束")
# # Browser goes back to the previous page
# browser.back()
# if index == 0:
#     browser.back()
#     time.sleep(0.1)
# # Fetch again
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# Send the crawled data to the Java service
# print('----------------------')
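In this hunk, process_data pages backwards by clicking "‹ 上頁" and then calling itself, so deep boards produce deep recursion. For comparison only, an iterative sketch of the same traversal; `browser` is a Selenium WebDriver and `handle_page` stands in for the body of process_data (both are assumptions, not the committed code):

    # Sketch: iterative form of the "click previous page, then reprocess" recursion.
    import time
    from selenium.common.exceptions import NoSuchElementException

    def crawl_pages_backwards(browser, handle_page):
        while True:
            handle_page()
            try:
                prev_button = browser.find_element('xpath', "//a[@class='btn wide' and text() = '‹ 上頁']")
            except NoSuchElementException:
                break  # no earlier page is available
            prev_button.click()
            time.sleep(0.3)  # same settle delay the commit uses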