Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
d6d072eb
Commit
d6d072eb
authored
Jul 11, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:爬取数据入库
parent
3769935a
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
49 additions
and
168 deletions
+49
-168
app.log
app.log
+0
-142
pc_ptt.py
pc_ptt.py
+49
-26
No files found.
app.log
View file @
d6d072eb
This diff is collapsed.
Click to expand it.
pc_ptt.py
View file @
d6d072eb
...
...
@@ -61,17 +61,15 @@ def write_to_database(data):
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://www.ptt.cc/bbs/hotboards.html"
browser
=
browser
or
create
([
'--headless'
])
# browser = browser or create()
# time.sleep(1)
# 打开网页
browser
.
get
(
url
)
log
.
debug
(
"已打开浏览器"
)
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
# log.debug(classify_item_list)
length
=
len
(
classify_item_list
)
for
index
in
range
(
length
):
if
1
<
index
<
3
:
if
0
<
index
<
2
:
classify_item_list
[
index
]
.
click
()
# if index==0:
time
.
sleep
(
1
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
length_two
=
len
(
element_list
)
...
...
@@ -104,35 +102,61 @@ def reptile(browser=None, search_word=""):
# ------------------------------------
# 使用BeautifulSoup解析HTML
soup
=
BeautifulSoup
(
element_content
.
get_attribute
(
'innerHTML'
),
'html.parser'
)
# 作者
element_author
=
browser
.
find_element
(
'xpath'
,
"//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]"
)
log
.
debug
(
'开始判断类型'
)
# ---------------- 判断类型 start ----------
# 查找所有img标签
img_tags
=
soup
.
find_all
(
'img'
)
# 类型
content_type
=
""
if
len
(
img_tags
)
>
0
:
content_type
=
"图文"
else
:
content_type
=
"文字"
# ---------------- 判断类型 end ----------
log
.
debug
(
'开始内容过滤'
)
# ------------------ content 过滤 start--------------
# 查找所有的<a>标签
a_tags
=
soup
.
find_all
(
'a'
,
href
=
True
)
# 循环遍历<a>标签,检查每个<a>标签是否包含<img>元素,如果包含则删除该<a>标签
for
tag
in
a_tags
:
tag
.
decompose
()
# 找到所有第一级标签为 `div` 的元素
div_elements
=
soup
.
find_all
(
'div'
)
# 逐个删除这些元素
for
div
in
div_elements
:
div
.
extract
()
# 删除第一级span
span_element
=
soup
.
find_all
(
'span'
)
for
span
in
span_element
:
span
.
extract
()
html
=
soup
.
prettify
()
.
replace
(
'amp;'
,
''
)
# log.debug(html)
# log.debug('11111')
# ------------------------------------
# 组装数据
print
(
html
)
print
(
"aaaaa"
)
# ------------------ content 过滤 end--------------
# --------------- 组装数据 start---------------------
obj
=
{
"title"
:
element_title
.
text
,
"content"
:
html
,
"link"
:
browser_current_url
,
"reptileTime"
:
str
(
int
(
time
.
time
()))
"reptileTime"
:
str
(
int
(
time
.
time
())),
"type"
:
content_type
,
"author"
:
element_author
.
text
}
# --------------- 组装数据 end---------------------
# ------------------------------------------------------
data
.
append
(
obj
)
# # 使用正则表达式进行匹配
# matches = re.findall(search_word, element_title.text)
# # 打印匹配结果
# if matches:
# # log.debug(f"找到了匹配的字符串:{matches}")
# data.append(obj)
# else:
# log.debug("未找到匹配的字符串")
# ------------------------------------------------------
# data.append(obj)
# 使用正则表达式进行匹配
matches
=
re
.
findall
(
search_word
,
element_title
.
text
)
# 打印匹配结果
if
matches
:
# log.debug(f"找到了匹配的字符串:{matches}")
data
.
append
(
obj
)
else
:
log
.
debug
(
"未找到匹配的字符串"
)
# 浏览器返回上一页
browser
.
back
()
...
...
@@ -185,11 +209,6 @@ def convert_to_traditional(simplified_text):
return
traditional_text
# 全局变量
data
=
[]
table_name
=
"pms_ptt"
def
main
():
# 请求关键词
response
=
getReptileTask
()
...
...
@@ -208,4 +227,8 @@ def main():
# upload_control()
# 全局变量
data
=
[]
table_name
=
"pms_ptt"
# 调用main函数
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment