liyang / network-assets-reptile
Commit ab99c057 authored Jul 13, 2023 by liyang
fix: speed up ptt crawling
parent 51625cb9
Showing 2 changed files with 129 additions and 108 deletions:

log/error.log  +11  -0
pc_ptt.py      +118 -108
log/error.log @ ab99c057
2023-07-11 20:04:32,430 ERROR pc_ptt.py : reptile [line: 94] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:47,713 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:23:51,168 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:24:12,330 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:27:18,984 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:28:31,234 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:30:08,742 ERROR pc_ptt.py : reptile [line: 64] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:32:45,950 ERROR pc_ptt.py : reptile [line: 65] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:35:08,341 ERROR pc_ptt.py : reptile [line: 65] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:39:23,710 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
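All of these entries come from the same bare `find_element` lookup in `reptile` (see the `pc_ptt.py` hunk below): when a PTT post page has not finished rendering, the article-meta XPath fails immediately, and the crawler logs the error and backs out. The commit keeps that try/except pattern; an explicit wait is a common alternative that polls briefly before giving up. A minimal sketch, not part of this commit, assuming Selenium 4 and the same `browser` WebDriver instance the script already holds:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

META_XPATH = "//div[@id='main-content']/div[3]//span[@class='article-meta-value']"

def find_meta_or_none(browser, timeout=3):
    # Poll up to `timeout` seconds for the article-meta span instead of failing at once.
    try:
        return WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.XPATH, META_XPATH))
        )
    except Exception:
        return None  # caller can still log the miss and browser.back(), as pc_ptt.py does

`find_meta_or_none` is a hypothetical helper name; the trade-off is a bounded delay per post against fewer spurious error.log lines.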
pc_ptt.py @ ab99c057
@@ -38,126 +38,136 @@ def reptile(browser=None, search_word=""):
     length = len(classify_item_list)
     for index in range(length):
         # For now, only crawl the second category
-        if 1 < index < 3:
+        if 0 < index < 4:
             type_title = classify_item_list[index].text
             classify_item_list[index].click()
             # time.sleep(0.1)
             time.sleep(0.1)
             element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
             length_two = len(element_list)
             for index_two in range(length_two):
-                # Title must not contain "公告"
-                # Match with a regular expression
-                # matches = re.findall("公告", element_list[index_two].text)
-                # log.debug(element_list[index_two].text+str(matches))
-                # Print the match result
-                # if matches:
-                #     log.debug(f"找到了匹配的字符串:{matches}")
-                element_list[index_two].click()
-                # time.sleep(0.1)
-                # Original link
-                browser_current_url = browser.current_url
-                log.debug('网页链接' + str(browser_current_url))
-                try:
-                    # Fetch the post details
-                    element_title = browser.find_element('xpath', "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
-                except:
-                    log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
-                    # Browser: go back one page
-                    browser.back()
-                    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
-                    break
-                # The content may contain images and video; post-processing is needed
-                element_content = browser.find_element('xpath', "//div[@id='main-content']")
-                # Remove <a> tags whose href contains 'img'
-                # ------------------------------------
-                # Parse the HTML with BeautifulSoup
-                soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
-                # Author
-                element_author = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
-                # Publish time
-                element_release = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
-                date_string = element_release.text
-                date_format = "%a %b %d %H:%M:%S %Y"
-                # Convert the date string to a datetime object
-                date_time = datetime.strptime(date_string, date_format)
-                # Convert the datetime object to a timestamp (seconds)
-                release_time = int(date_time.timestamp())
-                log.debug('开始判断类型')
-                # ---------------- type detection: start ----------
-                # Type
-                content_type = ""
-                try:
-                    # Find all img tags
-                    img_tags = soup.find_all('img')
-                    if len(img_tags) > 0:
-                        content_type = "图文"
-                    else:
-                        content_type = "文字"
-                except:
-                    content_type = "文字"
-                # ---------------- type detection: end ----------
-                log.debug('开始内容过滤')
-                # ------------------ content filtering: start --------------
-                try:
-                    # Find all <a> tags
-                    a_tags = soup.find_all('a', href=True)
-                    log.debug("a标签数量:" + str(len(a_tags)))
-                    # Iterate over the <a> tags and remove each one
-                    for tag in a_tags:
-                        tag.decompose()
-                except:
-                    log.debug("查找所有的<a>标签失败")
-                try:
-                    # Find all first-level `div` elements
-                    div_elements = soup.find_all('div')
-                    log.debug("一级div数量:" + str(len(div_elements)))
-                    # Remove these elements one by one
-                    for div in div_elements:
-                        div.extract()
-                    # Remove first-level spans
-                    span_element = soup.find_all('span')
-                    log.debug("一级span数量:" + str(len(span_element)))
-                    for span in span_element:
-                        span.extract()
-                except:
-                    log.debug("删除第一级div失败")
-                html = soup.prettify().replace('amp;', '')
-                # ------------------ content filtering: end --------------
-                # --------------- assemble data: start ---------------------
-                obj = {
-                    "title": element_title.text,
-                    "content": html,
-                    "link": browser_current_url,
-                    "reptileTime": str(int(time.time())),
-                    "type": content_type,
-                    "author": element_author.text,
-                    "releaseTime": release_time
-                }
-                # --------------- assemble data: end ---------------------
-                if search_word == "":
-                    data.append(obj)
-                else:
-                    # Match with a regular expression
-                    matches = re.findall(search_word, element_title.text)
-                    # Print the match result
-                    if matches:
-                        # log.debug(f"找到了匹配的字符串:{matches}")
-                        data.append(obj)
-                    else:
-                        log.debug("未找到匹配的字符串")
-                # Browser: go back one page
-                browser.back()
-                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
+                # Skip titles containing "公告" or "看板"
+                if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
+                    a = 1
+                else:
+                    log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
+                    # Match with a regular expression
+                    # matches =
+                    # log.debug(element_list[index_two].text+str(matches))
+                    # Print the match result
+                    # if matches:
+                    #     log.debug(f"找到了匹配的字符串:{matches}")
+                    element_list[index_two].click()
+                    time.sleep(0.1)
+                    # Original link
+                    browser_current_url = browser.current_url
+                    # log.debug('网页链接' + str(browser_current_url))
+                    try:
+                        # Fetch the post details
+                        element_title = browser.find_element('xpath', "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+                    except:
+                        log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+                        log.debug(f'页面链接:{browser_current_url}')
+                        # Browser: go back one page
+                        browser.back()
+                        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
+                        break
+                    # The content may contain images and video; post-processing is needed
+                    element_content = browser.find_element('xpath', "//div[@id='main-content']")
+                    # Remove <a> tags whose href contains 'img'
+                    # ------------------------------------
+                    # Parse the HTML with BeautifulSoup
+                    soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
+                    # Author
+                    element_author = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
+                    # Publish time
+                    element_release = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
+                    date_string = element_release.text
+                    date_format = "%a %b %d %H:%M:%S %Y"
+                    # Convert the date string to a datetime object
+                    date_time = datetime.strptime(date_string, date_format)
+                    # Convert the datetime object to a timestamp (seconds)
+                    release_time = int(date_time.timestamp())
+                    # log.debug('开始判断类型')
+                    # ---------------- type detection: start ----------
+                    # Type
+                    content_type = ""
+                    try:
+                        # Find all img tags
+                        img_tags = soup.find_all('img')
+                        if len(img_tags) > 0:
+                            content_type = "图文"
+                        else:
+                            content_type = "文字"
+                    except:
+                        content_type = "文字"
+                    # ---------------- type detection: end ----------
+                    # log.debug('开始内容过滤')
+                    # ------------------ content filtering: start --------------
+                    try:
+                        # Find all <a> tags
+                        a_tags = soup.find_all('a', href=True)
+                        # log.debug("a标签数量:" + str(len(a_tags)))
+                        # Iterate over the <a> tags and remove each one
+                        for tag in a_tags:
+                            tag.decompose()
+                    except:
+                        # log.debug("查找所有的<a>标签失败")
+                        a = 1
+                    try:
+                        # Find all first-level `div` elements
+                        div_elements = soup.find_all('div')
+                        # log.debug("一级div数量:" + str(len(div_elements)))
+                        # Remove these elements one by one
+                        for div in div_elements:
+                            div.extract()
+                        # Remove first-level spans
+                        span_element = soup.find_all('span')
+                        # log.debug("一级span数量:" + str(len(span_element)))
+                        for span in span_element:
+                            span.extract()
+                    except:
+                        # log.debug("删除第一级div失败")
+                        a = 2
+                    html = soup.prettify().replace('amp;', '')
+                    # ------------------ content filtering: end --------------
+                    # --------------- assemble data: start ---------------------
+                    obj = {
+                        "title": element_title.text,
+                        "content": html,
+                        "link": browser_current_url,
+                        "reptileTime": str(int(time.time())),
+                        "type": content_type,
+                        "author": element_author.text,
+                        "releaseTime": release_time
+                    }
+                    # --------------- assemble data: end ---------------------
+                    if search_word is None or search_word == str(search_word):
+                        data.append(obj)
+                    else:
+                        # Match with a regular expression
+                        # log.debug(f"关键词:{search_word}-{element_title.text}")
+                        matches = re.findall(search_word, element_title.text)
+                        # Print the match result
+                        if matches:
+                            # log.debug(f"找到了匹配的字符串:{matches}")
+                            data.append(obj)
+                        else:
+                            # log.debug("未找到匹配的字符串")
+                            a = 3
+                    # Browser: go back one page
+                    browser.back()
+                    element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
         # Browser: go back one page
         browser.back()
-        time.sleep(1)
+        # time.sleep(1)
+        time.sleep(0.1)
         # Refetch
         classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
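The content filter in the new code first decompose()s every <a href=...> tag, then extract()s the remaining div and span elements, so only the bare article text of #main-content survives into obj["content"]. A self-contained illustration of that pass on a made-up HTML fragment (the markup is an assumption, not a captured PTT page):

from bs4 import BeautifulSoup

# Hypothetical fragment standing in for element_content.get_attribute('innerHTML').
fragment = """
<div class="article-metaline"><span class="article-meta-value">author</span></div>
post body text
<a href="https://i.imgur.com/example.png">image link</a>
<span class="f2">pushed comment</span>
"""
soup = BeautifulSoup(fragment, 'html.parser')

for tag in soup.find_all('a', href=True):
    tag.decompose()  # remove link tags together with their text
for div in soup.find_all('div'):
    div.extract()    # detach metadata divs (their nested spans go with them)
for span in soup.find_all('span'):
    span.extract()   # detach any remaining spans, e.g. push comments

html = soup.prettify().replace('amp;', '')
print(html.strip())  # only "post body text" is left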