Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
ab99c057
Commit
ab99c057
authored
Jul 13, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:ptt 爬取加速
parent
51625cb9
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
129 additions
and
108 deletions
+129
-108
error.log
log/error.log
+11
-0
pc_ptt.py
pc_ptt.py
+118
-108
No files found.
log/error.log
View file @
ab99c057
2023-07-11 20:04:32,430 ERROR pc_ptt.py : reptile [line: 94] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-11 20:04:32,430 ERROR pc_ptt.py : reptile [line: 94] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:47,713 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:51,168 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:24:12,330 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:27:18,984 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:28:31,234 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:30:08,742 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:32:45,950 ERROR pc_ptt.py : reptile [line: 65] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:35:08,341 ERROR pc_ptt.py : reptile [line: 65] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:39:23,710 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
pc_ptt.py
View file @
ab99c057
...
@@ -38,30 +38,37 @@ def reptile(browser=None, search_word=""):
...
@@ -38,30 +38,37 @@ def reptile(browser=None, search_word=""):
length
=
len
(
classify_item_list
)
length
=
len
(
classify_item_list
)
for
index
in
range
(
length
):
for
index
in
range
(
length
):
# 暂时先爬取 第2个 分类
# 暂时先爬取 第2个 分类
if
1
<
index
<
3
:
if
0
<
index
<
4
:
type_title
=
classify_item_list
[
index
]
.
text
classify_item_list
[
index
]
.
click
()
classify_item_list
[
index
]
.
click
()
#
time.sleep(0.1)
time
.
sleep
(
0.1
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
length_two
=
len
(
element_list
)
length_two
=
len
(
element_list
)
for
index_two
in
range
(
length_two
):
for
index_two
in
range
(
length_two
):
# 标题不包含"公告"
# 标题不包含"公告"和"看板"
if
re
.
findall
(
"公告"
,
element_list
[
index_two
]
.
text
)
or
re
.
findall
(
"看板"
,
element_list
[
index_two
]
.
text
):
a
=
1
else
:
log
.
debug
(
f
"正在爬取分类:{type_title}-第{index_two + 1}条"
)
# 使用正则表达式进行匹配
# 使用正则表达式进行匹配
# matches = re.findall("公告", element_list[index_two].text)
# matches =
# log.debug(element_list[index_two].text+str(matches))
# log.debug(element_list[index_two].text+str(matches))
# 打印匹配结果
# 打印匹配结果
# if matches:
# if matches:
# log.debug(f"找到了匹配的字符串:{matches}")
# log.debug(f"找到了匹配的字符串:{matches}")
element_list
[
index_two
]
.
click
()
element_list
[
index_two
]
.
click
()
#
time.sleep(0.1)
time
.
sleep
(
0.1
)
# 原链接
# 原链接
browser_current_url
=
browser
.
current_url
browser_current_url
=
browser
.
current_url
log
.
debug
(
'网页链接'
+
str
(
browser_current_url
))
#
log.debug('网页链接' + str(browser_current_url))
try
:
try
:
# 获取帖子详情
# 获取帖子详情
element_title
=
browser
.
find_element
(
'xpath'
,
element_title
=
browser
.
find_element
(
'xpath'
,
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
except
:
except
:
log
.
error
(
"xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
log
.
error
(
"xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
log
.
debug
(
f
'页面链接:{browser_current_url}'
)
# 浏览器返回上一页
# 浏览器返回上一页
browser
.
back
()
browser
.
back
()
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
...
@@ -84,7 +91,8 @@ def reptile(browser=None, search_word=""):
...
@@ -84,7 +91,8 @@ def reptile(browser=None, search_word=""):
date_time
=
datetime
.
strptime
(
date_string
,
date_format
)
date_time
=
datetime
.
strptime
(
date_string
,
date_format
)
# 将datetime对象转换为时间戳(以秒为单位)
# 将datetime对象转换为时间戳(以秒为单位)
release_time
=
int
(
date_time
.
timestamp
())
release_time
=
int
(
date_time
.
timestamp
())
log
.
debug
(
'开始判断类型'
)
# log.debug('开始判断类型')
# ---------------- 判断类型 start ----------
# ---------------- 判断类型 start ----------
# 类型
# 类型
content_type
=
""
content_type
=
""
...
@@ -98,33 +106,34 @@ def reptile(browser=None, search_word=""):
...
@@ -98,33 +106,34 @@ def reptile(browser=None, search_word=""):
except
:
except
:
content_type
=
"文字"
content_type
=
"文字"
# ---------------- 判断类型 end ----------
# ---------------- 判断类型 end ----------
log
.
debug
(
'开始内容过滤'
)
#
log.debug('开始内容过滤')
# ------------------ content 过滤 start--------------
# ------------------ content 过滤 start--------------
try
:
try
:
# 查找所有的<a>标签
# 查找所有的<a>标签
a_tags
=
soup
.
find_all
(
'a'
,
href
=
True
)
a_tags
=
soup
.
find_all
(
'a'
,
href
=
True
)
log
.
debug
(
"a标签数量:"
+
str
(
len
(
a_tags
)))
#
log.debug("a标签数量:" + str(len(a_tags)))
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for
tag
in
a_tags
:
for
tag
in
a_tags
:
tag
.
decompose
()
tag
.
decompose
()
except
:
except
:
log
.
debug
(
"查找所有的<a>标签失败"
)
#
log.debug("查找所有的<a>标签失败")
a
=
1
try
:
try
:
# 找到所有第一级标签为 `div` 的元素
# 找到所有第一级标签为 `div` 的元素
div_elements
=
soup
.
find_all
(
'div'
)
div_elements
=
soup
.
find_all
(
'div'
)
log
.
debug
(
"一级div数量:"
+
str
(
len
(
div_elements
)))
#
log.debug("一级div数量:" + str(len(div_elements)))
# 逐个删除这些元素
# 逐个删除这些元素
for
div
in
div_elements
:
for
div
in
div_elements
:
div
.
extract
()
div
.
extract
()
# 删除第一级span
# 删除第一级span
span_element
=
soup
.
find_all
(
'span'
)
span_element
=
soup
.
find_all
(
'span'
)
log
.
debug
(
"一级span数量:"
+
str
(
len
(
span_element
)))
#
log.debug("一级span数量:" + str(len(span_element)))
for
span
in
span_element
:
for
span
in
span_element
:
span
.
extract
()
span
.
extract
()
except
:
except
:
log
.
debug
(
"删除第一级div失败"
)
# log.debug("删除第一级div失败")
a
=
2
html
=
soup
.
prettify
()
.
replace
(
'amp;'
,
''
)
html
=
soup
.
prettify
()
.
replace
(
'amp;'
,
''
)
# ------------------ content 过滤 end--------------
# ------------------ content 过滤 end--------------
...
@@ -140,24 +149,25 @@ def reptile(browser=None, search_word=""):
...
@@ -140,24 +149,25 @@ def reptile(browser=None, search_word=""):
}
}
# --------------- 组装数据 end---------------------
# --------------- 组装数据 end---------------------
if
search_word
==
""
:
if
search_word
is
None
or
search_word
==
str
(
search_word
)
:
data
.
append
(
obj
)
data
.
append
(
obj
)
else
:
else
:
# 使用正则表达式进行匹配
# 使用正则表达式进行匹配
# log.debug(f"关键词:{search_word}-{element_title.text}")
matches
=
re
.
findall
(
search_word
,
element_title
.
text
)
matches
=
re
.
findall
(
search_word
,
element_title
.
text
)
# 打印匹配结果
# 打印匹配结果
if
matches
:
if
matches
:
# log.debug(f"找到了匹配的字符串:{matches}")
# log.debug(f"找到了匹配的字符串:{matches}")
data
.
append
(
obj
)
data
.
append
(
obj
)
else
:
else
:
log
.
debug
(
"未找到匹配的字符串"
)
#
log.debug("未找到匹配的字符串")
a
=
3
# 浏览器返回上一页
# 浏览器返回上一页
browser
.
back
()
browser
.
back
()
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
# 浏览器返回上一页
# 浏览器返回上一页
browser
.
back
()
browser
.
back
()
# time.sleep(
1)
time
.
sleep
(
0.
1
)
# 重新获取
# 重新获取
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment