liyang / network-assets-reptile · Commits

Commit 3a027fb6
authored Jul 10, 2023 by liyang
parent 8708ce28

feat: debug data ingestion into the database

Showing 3 changed files with 81 additions and 43 deletions (+81 -43)
.gitignore    +1  -1
api/index.py  +16 -1
pc_ptt.py     +64 -41
.gitignore  (view file @ 3a027fb6)

 reptile_data/files
-reptile_data/ptt/*.json
 *.zip
 *.json
 .DS_Store
 venv
+reptile_data
 *.idea
\ No newline at end of file
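Note that the added top-level "reptile_data" pattern ignores the whole directory, which also covers "reptile_data/files" and the narrower "reptile_data/ptt/*.json" rule this commit removes.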
api/index.py  (view file @ 3a027fb6)

@@ -11,8 +11,23 @@ def importJson(file, form_data):
     # Parse the response body as JSON
     return {"status_code": response.status_code, "data": response.json()}


 def getReptileTask():
     http_url = baser_url + "crawlerSetting/list"
     response = requests.get(http_url, headers=headers)
     # Parse the response body as JSON
     return {"status_code": response.status_code, "data": response.json()}
+
+
+def importJsonPath(form_data):
+    headers = {"Content-Type": "application/json"}
+    http_url = baser_url + "importJson/importJsonPath"
+    response = requests.post(http_url, headers=headers, data=form_data)
+    # Parse the response body as JSON
+    return {"status_code": response.status_code, "data": response.json()}
+
+
+def runingPython(form_data):
+    headers = {"Content-Type": "application/json"}
+    http_url = baser_url + "python/startPy"
+    response = requests.post(http_url, headers=headers, data=form_data)
+    # Parse the response body as JSON
+    return {"status_code": response.status_code, "data": response.json()}
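Both new helpers follow the existing importJson/getReptileTask pattern: POST to the Java service, then wrap the HTTP status and the parsed JSON body. A minimal caller sketch, assuming baser_url keeps its module-level definition in api/index.py; since the request sets Content-Type: application/json but passes data=, the caller should hand over an already-serialized JSON string (the file path below is hypothetical):

import json
from api.index import importJsonPath

# Hypothetical payload: the absolute path of a saved JSON file and the target table.
form_data = json.dumps({
    "path": "/srv/reptile_data/ptt/1688970000.json",  # hypothetical path
    "tableName": "pms_ptt",
})
result = importJsonPath(form_data)
if result["status_code"] == 200 and result["data"].get("code") == 200:
    print("import accepted by the Java service")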
pc_ptt.py  (view file @ 3a027fb6)

@@ -13,6 +13,8 @@ from utils.Logger import log
 # from requests_toolbelt import *
 from utils.createBrowserDriver import create
 import opencc
+from utils.filse import save_json
+import os
 '''
 Crawls hot posts from Taiwan's PTT forum, including each post's title and content (text, images, videos).
@@ -20,8 +22,6 @@ import opencc
 Crawl flow: create the driver -> open the browser -> open the page -> scrape the board categories -> click through each one -> scrape the hot post titles -> click through each one -> scrape the post details
 '''
-
-data = []

 def write_to_database(data):
     # Connect to the database
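For orientation, here is a condensed sketch of the flow that docstring describes, assuming create() returns a Selenium WebDriver (its actual signature in utils.createBrowserDriver is not shown in this diff) and using a hypothetical entry URL:

from utils.createBrowserDriver import create

def crawl_flow_sketch():
    browser = create()                                    # create the driver / open the browser
    browser.get("https://www.ptt.cc/bbs/hotboards.html")  # open the page (hypothetical URL)
    boards = browser.find_elements('xpath', "//div[@class='board-class']")  # scrape the categories
    for index in range(len(boards)):
        # re-find on every pass: navigating away invalidates the old element handles
        boards = browser.find_elements('xpath', "//div[@class='board-class']")
        boards[index].click()                             # click through each category
        # scrape the hot post titles, click each one, then scrape the post details
        titles = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
        browser.back()                                    # return to the category list
    browser.quit()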
@@ -74,14 +74,27 @@ def reptile(browser=None, search_word=""):
         element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
         length_two = len(element_list)
         for index_two in range(length_two):
+            # Skip titles containing "公告" (announcements)
+            # Match with a regular expression
+            # matches = re.findall("公告", element_list[index_two].text)
+            # log.debug(element_list[index_two].text + str(matches))
+            # Print the match result
+            # if matches:
+            #     log.debug(f"Found a matching string: {matches}")
             element_list[index_two].click()
             time.sleep(1)
             # Original link
             browser_current_url = browser.current_url
             log.debug('Page URL: ' + str(browser_current_url))
+            try:
                 # Get the post details
                 element_title = browser.find_element('xpath',
                                                      "//div[@id='main-content']/div[3]//span[@class='article-meta-value']")
+            except:
+                # Go back to the previous page
+                browser.back()
+                element_list = browser.find_elements('xpath', "//div[@class='r-ent']//div[@class='title']//a")
+                break
             # The content may contain images and videos, which need post-processing
             element_content = browser.find_element('xpath', "//div[@id='main-content']")
             # Strip <a> tags whose href value contains 'img'
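The try/except added above is a navigation-recovery pattern: when the detail page lacks the expected meta element, the crawler goes back, and the title links must be located again, because WebElements found before navigation go stale. A condensed sketch of the pattern (the named exception is an assumption; the commit uses a bare except:):

from selenium.common.exceptions import NoSuchElementException

TITLE_XPATH = "//div[@id='main-content']/div[3]//span[@class='article-meta-value']"
LIST_XPATH = "//div[@class='r-ent']//div[@class='title']//a"

def title_or_recover(browser):
    """Return the post-title element, or go back and re-find the list links."""
    try:
        return browser.find_element('xpath', TITLE_XPATH)
    except NoSuchElementException:
        browser.back()                                      # leaving the page stales old handles
        return browser.find_elements('xpath', LIST_XPATH)   # so locate them again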
@@ -122,56 +135,66 @@ def reptile(browser=None, search_word=""):
     classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
     # Send the scraped data to the Java service
-    # Save the JSON file locally
-    file_dir = f'./reptile_data/ptt/{int(time.time())}'
-    # state_save = save_json(file_dir, data)
     def upload_control():
         # Define the form data
         form_data = {
             "tableName": "pms_ptt",
         }
-        file = io.BytesIO(json.dumps(data).encode())
-        response = importJson(file, form_data)
-        if response['status_code'] == 200 and response['data']['code'] == 200:
-            log.debug("Call succeeded")
-        else:
-            log.debug("Call failed")
+        # file = io.BytesIO(json.dumps(data).encode())
+        # response = importJson(file, form_data)
+        # if response['status_code'] == 200 and response['data']['code'] == 200:
+        #     log.debug("Call succeeded")
+        # else:
+        #     log.debug("Call failed")
     # upload_control()
-    # write_to_database(data)
-    if state_save:
-        log.debug('File saved successfully')
-    else:
-        log.debug('Failed to save the file')
+    if len(data) == 0:
+        # upload_control()
+        log.debug("The data is empty")
+    # Save the JSON file locally
+    else:
+        file_dir = f'./reptile_data/ptt/{int(time.time())}'
+        upload_control()
+        state_save = save_json(file_dir, data)
+        # write_to_database(data)
+        if state_save:
+            log.debug('File saved successfully')
+            log.debug('Absolute file path: ' + os.path.abspath(file_dir))
+            form_data = {"path": os.path.abspath(file_dir) + ".json", "tableName": table_name}
+            response = importJsonPath(form_data)
+        else:
+            log.debug('Failed to save the file')
     # Close the browser driver
     # time.sleep(3)
     browser.quit()
-response = getReptileTask()

 def convert_to_traditional(simplified_text):
     converter = opencc.OpenCC('s2t.json')  # Create a Simplified-to-Traditional Chinese converter
     traditional_text = converter.convert(simplified_text)  # Perform the conversion
     return traditional_text
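A quick usage check for the converter (the 's2t.json' profile ships with the opencc package):

print(convert_to_traditional("网络资产"))  # -> 網絡資產 (simplified to traditional)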
-if response['status_code'] == 200 and response['data']['code'] == 200:
+# Global variables
+data = []
+table_name = ""
+
+
+def main():
+    global table_name  # so reptile() sees the table name chosen below
+    # Request the crawl keywords
+    response = getReptileTask()
+    if response['status_code'] == 200 and response['data']['code'] == 200:
         log.debug("Call succeeded")
         search_word = ""
         for item in response['data']['rows']:
             if item['name'] == 'ptt':
                 search_word = item['keyword']
+                table_name = item['tableName']
         # print(convert_to_traditional(search_word))
         reptile(None, convert_to_traditional(search_word))
     else:
         log.debug("Call failed")
     # upload_control()
+
+
+main()
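Taken together, the commit's new ingestion path is: scrape, save the rows with save_json to a timestamped file, then hand the file's absolute path and table name to the Java service through importJsonPath. A minimal sketch of that hand-off under this diff's apparent contract, assuming save_json(path, obj) writes path + ".json" and returns truthiness, and that importJsonPath expects a JSON string:

import json
import os
import time

from api.index import importJsonPath
from utils.filse import save_json

def ship_to_java(rows, table_name):
    """Save scraped rows locally, then ask the Java service to import the file."""
    file_dir = f'./reptile_data/ptt/{int(time.time())}'
    if save_json(file_dir, rows):                # assumed to write file_dir + ".json"
        payload = json.dumps({
            "path": os.path.abspath(file_dir) + ".json",
            "tableName": table_name,
        })
        return importJsonPath(payload)           # the Java service imports the file by path
    return None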