liyang / network-assets-reptile

Commit dae3fe3d authored Aug 09, 2023 by liyang

feat:ptt debug

parent 27018e2d

Showing 2 changed files with 185 additions and 149 deletions (+185 -149)
log/error.log  +9 -0
pc_ptt.py  +176 -149
log/error.log
@@ -11,3 +11,12 @@
2023-07-13 16:41:30,332 ERROR pc_ptt.py : reptile [line: 66] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-13 16:43:37,394 ERROR pc_ptt.py : reptile [line: 67] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-07-21 10:54:17,501 ERROR pc_ptt.py : reptile [line: 73] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:32,527 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:41,957 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:41:43,433 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:51:10,728 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:52:41,156 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:58:54,782 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:04,220 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 15:59:27,844 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
2023-08-09 16:00:02,916 ERROR pc_ptt.py : reptile [line: 120] xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']
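The nine new entries all point at pc_ptt.py line 120, where the article-meta XPath is looked up on pages that do not contain that element. For reference, a guarded lookup using Selenium's explicit-wait API would separate "not yet rendered" from "genuinely absent"; this sketch is illustrative only and not part of the commit:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    META_XPATH = "//div[@id='main-content']/div[3]//span[@class='article-meta-value']"

    def find_article_meta(browser, timeout=3):
        # Return the element, or None when the post has no meta block.
        try:
            return WebDriverWait(browser, timeout).until(
                EC.presence_of_element_located((By.XPATH, META_XPATH)))
        except TimeoutException:
            return None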
pc_ptt.py
@@ -8,6 +8,9 @@ import loguru
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.common import NoSuchElementException
from api.index import importJson, getReptileTask, importJsonPath
from utils.Logger import log
from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
@@ -42,8 +45,8 @@ def reptile(browser=None, search_word=""):
    classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
    for index, item_element in enumerate(classify_item_list):
        # For now, crawl the 2nd category
        if 0 <= index <= 14:
        # Only crawl the "综合" (general) category
        if 0 <= index < 1:
            type_title = classify_item_list[index].text
            # Enter the category page
            classify_item_list[index].click()
@@ -59,164 +62,188 @@ def reptile(browser=None, search_word=""):
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-ent']")))
            element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
            page = 1
            for index_two, item in enumerate(element_list):
                # print(element_list[index_two].text)
                try:
                    re.findall("公告", item.text)
                except IndexError:
                    log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
                    print("当前连接:" + str(browser.current_url))
                    print(data[len(data) - 1]["title"])
                # Use a regular expression to match the search keyword
                if re.findall(search_word, item.text):
                    # log.debug(f"找到了匹配的字符串:{matches}")
                    error = ""
                else:
                    # The post title did not match the keyword; skip this iteration and move on to the next one
                    continue
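Note that search_word is passed to re.findall as a pattern, so a keyword containing regex metacharacters (for example "C++") would be interpreted as a regular expression rather than matched literally. A minimal literal-match variant, assuming only the standard library:

    import re

    def title_matches(search_word: str, title: str) -> bool:
        # re.escape makes the keyword match literally even if it
        # contains metacharacters such as '+' or '('.
        return bool(re.search(re.escape(search_word), title))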
                # element_meta_list = browser.find_elements("xpath", "//div[@class='r-ent']//div[@class='meta']")
                # Title contains neither "公告" (announcement) nor "看板" (board)
                if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
                    a = 1
                else:
                    # Match with a regular expression
                    # matches =
                    # log.debug(element_list[index_two].text+str(matches))
                    # Print the match result
                    # if matches:
                    #     log.debug(f"找到了匹配的字符串:{matches}")
                    element_list[index_two].click()
                    time.sleep(0.1)
                    # Original link
                    browser_current_url = browser.current_url
                    # print(browser_current_url)
                    # log.debug('网页链接' + str(browser_current_url))

            def process_data():
                # Add search
                search_input = browser.find_element("xpath", "//div[@class='search-bar']//input")
                if search_word != search_input.get_attribute("value"):
                    # Enter the search keyword
                    search_input.send_keys(search_word)
                    # Click the input box
                    # browser.execute_script("arguments[0].click();", search_input)
                    # Confirm the search keyword
                    # search_input.click()
                    search_input.submit()
                    # Sleep for 300 ms so the page can finish loading
                    time.sleep(0.3)
                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
                nonlocal page
                print(f"当前为第:{page} 页,共 {len(element_list)} 条数据")
                for index_two, item in enumerate(element_list):
                    # print(element_list[index_two].text)
                    try:
                        # Get the post details
                        element_title = browser.find_element('xpath', "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                        re.findall("公告", item.text)
                    except IndexError:
                        error = ""
                        # log.debug(f"正在爬取分类:{type_title}-第{index_two + 1}条")
                        # print("当前连接:" + str(browser.current_url))
                        # print(data[len(data) - 1]["title"])
                    except:
                        log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                        log.debug(f'页面链接:{browser_current_url}')
                        # Browser goes back one page
                        browser.back()
                        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
                        break
                    # The content may contain images and videos, which need post-processing
                    element_content = browser.find_element('xpath', "//div[@id='main-content']")
                    # Remove <a> tags whose href value contains 'img'
                    # ------------------------------------
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
                    # Author
                    element_author = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
                    # Release time
                    element_release = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
                    # Title contains neither "公告" (announcement) nor "看板" (board)
                    if re.findall("公告", element_list[index_two].text) or re.findall("看板", element_list[index_two].text):
                        a = 1
                    else:
                        element_list[index_two].click()
                        time.sleep(0.2)
                        # Original link
                        browser_current_url = browser.current_url
                        try:
                            # Get the post details
                            element_title = browser.find_element('xpath', "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                            # log.debug('开始判断类型')
                            try:
                                # Find all first-level `div` elements
                                div_elements = soup.find_all('div')
                                # log.debug("一级div数量:" + str(len(div_elements)))
                                # Remove these elements one by one
                                for key, div in enumerate(div_elements):
                                    if key > 0:
                                        div.extract()
                                # Remove the first-level spans
                                span_element = soup.find_all('span')
                                # log.debug("一级span数量:" + str(len(span_element)))
                                for span in span_element:
                                    span.extract()
                            except:
                                log.error("xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']")
                                log.debug(f'页面链接:{browser_current_url}')
                                # Browser goes back one page
                                browser.back()
                                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
                                break
                            # The content may contain images and videos, which need post-processing
                            element_content = browser.find_element('xpath', "//div[@id='main-content']")
                            # Remove <a> tags whose href value contains 'img'
                            # ------------------------------------
                            # Parse the HTML with BeautifulSoup
                            soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
                            # Author
                            element_author = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
                            # Release time
                            element_release = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][3]/span[2]")
                        except:
                            # log.debug("删除第一级div失败")
                            a = 2
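The nested try above keeps only the first top-level <div> of #main-content and strips the metadata <span>s before classifying the post body. The same BeautifulSoup pruning idea in isolation (the sample HTML below is made up for illustration):

    from bs4 import BeautifulSoup

    html = "<div id='main-content'>body text<div>push 1</div><span>meta</span></div>"
    soup = BeautifulSoup(html, 'html.parser')

    # Keep the first <div>, extract every later one, then drop all <span>s.
    for key, div in enumerate(soup.find_all('div')):
        if key > 0:
            div.extract()  # extract() detaches the tag and returns it
    for span in soup.find_all('span'):
        span.extract()

    print(soup.get_text())  # -> "body text"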
                        # ---------------- type detection start ----------
                        # Type
                        content_type = ""
                        # Find all img tags
                        image_list = soup.find_all('img')
                        try:
                            # log.debug('开始判断类型')
                            try:
                                # Find all first-level `div` elements
                                div_elements = soup.find_all('div')
                                # log.debug("一级div数量:" + str(len(div_elements)))
                                # Remove these elements one by one
                                for key, div in enumerate(div_elements):
                                    if key > 0:
                                        div.extract()
                                # Remove the first-level spans
                                span_element = soup.find_all('span')
                                # log.debug("一级span数量:" + str(len(span_element)))
                                for span in span_element:
                                    span.extract()
                            except:
                                # log.debug("删除第一级div失败")
                                a = 2
                            # ---------------- type detection start ----------
                            # Type
                            content_type = ""
                            # Find all img tags
                            image_list = soup.find_all('img')
                            try:
                                if len(image_list) > 0:
                                    content_type = "图文"
                                else:
                                    content_type = "文字"
                            except:
                                content_type = "文字"
                            picture_url = []
                            if len(image_list) > 0:
                                content_type = "图文"
                                for key, element in enumerate(image_list):
                                    # Download the image locally and replace the src in the tag
                                    id = str(int(time.time()))
                                    # Download path
                                    download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
                                    # Access URL
                                    access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
                                    # Download status
                                    status = download_image(element['src'], download_dir)
                                    if status:
                                        element['src'] = access_address
                                        picture_url.append(download_dir)
                            else:
                                content_type = "文字"
                        except:
                            content_type = "文字"
                        picture_url = []
                        if len(image_list) > 0:
                            for key, element in enumerate(image_list):
                                # Download the image locally and replace the src in the tag
                                id = str(int(time.time()))
                                # Download path
                                download_dir = f'{os.path.join(local_path, f"{id}.jpg")}'
                                # Access URL
                                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{local_path_name}/{id}.jpg'
                                # Download status
                                status = download_image(element['src'], download_dir)
                                if status:
                                    element['src'] = access_address
                                    picture_url.append(download_dir)
                        else:
                            # print("")
                            error = ""
                        # ---------------- type detection end ----------
                        # log.debug('开始内容过滤')
                        # ------------------ content filtering start --------------
                        try:
                            # Find all <a> tags
                            a_tags = soup.find_all('a', href=True)
                            # log.debug("a标签数量:" + str(len(a_tags)))
                            # Walk the <a> tags; check whether each one contains an <img> element and, if so, remove that <a> tag
                            for tag in a_tags:
                                tag.decompose()
                        except:
                            # log.debug("查找所有的<a>标签失败")
                            a = 1
                        html = soup.prettify().replace('amp;', '')
                        # ------------------ content filtering end --------------
                        # print("")
                        error = ""
                        # ---------------- type detection end ----------
                        # log.debug('开始内容过滤')
                        # ------------------ content filtering start --------------
                        try:
                            # Find all <a> tags
                            a_tags = soup.find_all('a', href=True)
                            # log.debug("a标签数量:" + str(len(a_tags)))
                            # Walk the <a> tags; check whether each one contains an <img> element and, if so, remove that <a> tag
                            for tag in a_tags:
                                tag.decompose()
                        except:
                            # log.debug("查找所有的<a>标签失败")
                            a = 1
                        html = soup.prettify().replace('amp;', '')
                        # ------------------ content filtering end --------------
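Two observations on this filter: the loop decomposes every <a href> tag, although the comment only intends anchors that wrap an <img>; and .replace('amp;', '') is a blunt way to undo double-escaped entities that would also mangle ordinary text containing that substring. A more targeted variant might look like this (illustrative sketch only):

    import html as html_lib
    from bs4 import BeautifulSoup

    def strip_image_links(soup: BeautifulSoup) -> str:
        # Drop only the <a> tags that actually wrap an <img>.
        for tag in soup.find_all('a', href=True):
            if tag.find('img') is not None:
                tag.decompose()
        # html.unescape reverses entity escaping without corrupting
        # legitimate text that happens to contain 'amp;'.
        return html_lib.unescape(soup.prettify())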
                        date_string = element_release.text
                        date_format = "%a %b %d %H:%M:%S %Y"
                        # Convert the date string to a datetime object
                        date_time = datetime.strptime(date_string, date_format)
                        # Convert the datetime object to a timestamp (in seconds)
                        release_time = int(date_time.timestamp())
                        date_string = element_release.text
                        # date_string = "Wed Aug 9 15:39:26 2023 //update 20934353"
                        # Extract the date string
                        if "//" in date_string:
                            date_string = date_string.split("//")[0]
                        date_string = date_string.strip()
                        # Time filter
                        if beginFiltrationTime <= release_time <= endFiltrationTime:
                            # --------------- assemble data start ---------------------
                            obj = {
                                "title": element_title.text,
                                "content": html,
                                "link": browser_current_url,
                                "reptileTime": str(int(time.time())),
                                "type": content_type,
                                "author": element_author.text,
                                "releaseTime": str(release_time),
                                "picture_url": ",".join(picture_url)
                            }
                            # --------------- assemble data end ---------------------
                            data.append(obj)
                        # Browser goes back one page
                        browser.back()
                        time.sleep(0.1)
                        element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
                        date_format = "%a %b %d %H:%M:%S %Y"
                        # Convert the date string to a datetime object
                        date_time = datetime.strptime(date_string, date_format)
                        # Convert the datetime object to a timestamp (in seconds)
                        release_time = int(date_time.timestamp())
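The commented-out sample "Wed Aug 9 15:39:26 2023 //update 20934353" suggests why the split was added: the timestamp can carry a trailing edit marker, which makes strptime fail. The cleaned-up parse, using only the standard library:

    from datetime import datetime

    raw = "Wed Aug  9 15:39:26 2023 //update 20934353"
    date_string = raw.split("//")[0].strip() if "//" in raw else raw.strip()
    date_time = datetime.strptime(date_string, "%a %b %d %H:%M:%S %Y")
    print(int(date_time.timestamp()))  # epoch seconds; exact value depends on the local timezone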
# print("循环结束")
# 浏览器返回上一页
browser
.
back
()
if
index
==
0
:
browser
.
back
()
time
.
sleep
(
0.1
)
# 重新获取
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
# 过滤时间
if
beginFiltrationTime
<=
release_time
<=
endFiltrationTime
:
# --------------- 组装数据 start---------------------
obj
=
{
"title"
:
element_title
.
text
,
"content"
:
html
,
"link"
:
browser_current_url
,
"reptileTime"
:
str
(
int
(
time
.
time
())),
"type"
:
content_type
,
"author"
:
element_author
.
text
,
"releaseTime"
:
str
(
release_time
),
"picture_url"
:
","
.
join
(
picture_url
)
}
# --------------- 组装数据 end---------------------
data
.
append
(
obj
)
# 浏览器返回上一页
browser
.
back
()
time
.
sleep
(
0.2
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
page
=
page
+
1
# print("111111")
try
:
prev_button
=
browser
.
find_element
(
'xpath'
,
"//a[@class='btn wide' and text() = '‹ 上頁']"
)
prev_button
.
click
()
time
.
sleep
(
0.3
)
process_data
()
except
:
error
=
""
process_data
()
# # print("循环结束")
# # 浏览器返回上一页
# browser.back()
# if index == 0:
# browser.back()
# time.sleep(0.1)
# # 重新获取
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
# 发送爬取数据到java服务
# print('----------------------')
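Pagination in this hunk is recursive: each successful click on '‹ 上頁' calls process_data() again, so a very long board could in principle exhaust CPython's default recursion limit (roughly 1000 frames). An iterative sketch with the same control flow, assuming time is imported as in this file:

    def crawl_all_pages(browser, process_page, delay=0.3):
        # Handle the current page, then keep following '‹ 上頁' until it disappears.
        while True:
            process_page()
            prev_buttons = browser.find_elements(
                'xpath', "//a[@class='btn wide' and text() = '‹ 上頁']")
            if not prev_buttons:
                break
            prev_buttons[0].click()
            time.sleep(delay)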