Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
992c7fbb
Commit
992c7fbb
authored
Jul 11, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:爬取数据入库
parent
0c402bd4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
62 additions
and
29 deletions
+62
-29
app.log
app.log
+25
-0
pc_ptt.py
pc_ptt.py
+27
-29
.gitkeep
user_data/.gitkeep
+0
-0
createBrowserDriver.py
utils/createBrowserDriver.py
+10
-0
No files found.
app.log
View file @
992c7fbb
...
@@ -282,3 +282,28 @@
...
@@ -282,3 +282,28 @@
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 160] 文件保存成功
2023-07-10 20:07:10,800 DEBUG pc_ptt.py : reptile [line: 160] 文件保存成功
2023-07-10 20:07:10,801 DEBUG pc_ptt.py : reptile [line: 162] 文件绝对路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688990830.json
2023-07-10 20:07:10,801 DEBUG pc_ptt.py : reptile [line: 162] 文件绝对路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688990830.json
2023-07-10 20:53:30,796 DEBUG pc_ptt.py : main [line: 191] call success
2023-07-10 20:53:37,765 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991749.A.198.html
2023-07-10 20:53:40,542 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991994.A.D59.html
2023-07-10 20:53:40,583 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:42,508 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.4CE.html
2023-07-10 20:53:42,546 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:44,395 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688991995.A.797.html
2023-07-10 20:53:44,433 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:45,741 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688992089.A.50C.html
2023-07-10 20:53:47,085 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688992768.A.CC7.html
2023-07-10 20:53:48,390 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993164.A.DB1.html
2023-07-10 20:53:48,428 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:49,690 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993204.A.E97.html
2023-07-10 20:53:49,727 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:51,003 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993233.A.0E7.html
2023-07-10 20:53:51,041 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:52,275 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1688993306.A.8D6.html
2023-07-10 20:53:52,314 DEBUG pc_ptt.py : reptile [line: 127] 未找到匹配的字符串
2023-07-10 20:53:53,630 DEBUG pc_ptt.py : reptile [line: 88] 网页链接https://www.ptt.cc/bbs/Baseball/M.1634212242.A.283.html
2023-07-10 20:53:54,783 DEBUG pc_ptt.py : reptile [line: 153] /Users/macosx/Desktop/项目文档/网络资产管理系统
2023-07-10 20:53:54,784 DEBUG filse.py : save_json [line: 8] ------save json start--------
2023-07-10 20:53:54,784 DEBUG filse.py : save_json [line: 14] 文件保存路径:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 157] -----------------------------
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 160] save file success
2023-07-10 20:53:54,786 DEBUG pc_ptt.py : reptile [line: 162] file_path:/Users/macosx/Desktop/项目文档/网络资产管理系统/network-assets-reptile/reptile_data/ptt/1688993634.json
pc_ptt.py
View file @
992c7fbb
...
@@ -60,7 +60,8 @@ def write_to_database(data):
...
@@ -60,7 +60,8 @@ def write_to_database(data):
def
reptile
(
browser
=
None
,
search_word
=
""
):
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://www.ptt.cc/bbs/hotboards.html"
url
=
"https://www.ptt.cc/bbs/hotboards.html"
browser
=
browser
or
create
([
'--headless'
])
# browser = browser or create(['--headless'])
browser
=
browser
or
create
()
# time.sleep(1)
# time.sleep(1)
# 打开网页
# 打开网页
browser
.
get
(
url
)
browser
.
get
(
url
)
...
@@ -70,6 +71,7 @@ def reptile(browser=None, search_word=""):
...
@@ -70,6 +71,7 @@ def reptile(browser=None, search_word=""):
for
index
in
range
(
length
):
for
index
in
range
(
length
):
if
0
<
index
<
2
:
if
0
<
index
<
2
:
classify_item_list
[
index
]
.
click
()
classify_item_list
[
index
]
.
click
()
# if index==0:
time
.
sleep
(
1
)
time
.
sleep
(
1
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
length_two
=
len
(
element_list
)
length_two
=
len
(
element_list
)
...
@@ -91,6 +93,7 @@ def reptile(browser=None, search_word=""):
...
@@ -91,6 +93,7 @@ def reptile(browser=None, search_word=""):
element_title
=
browser
.
find_element
(
'xpath'
,
element_title
=
browser
.
find_element
(
'xpath'
,
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
"//div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
except
:
except
:
log
.
error
(
"xpath 找不到元素://div[@id='main-content']/div[3]//span[@class='article-meta-value']"
)
# 浏览器返回上一页
# 浏览器返回上一页
browser
.
back
()
browser
.
back
()
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
element_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='r-ent']//div[@class='title']//a"
)
...
@@ -135,23 +138,10 @@ def reptile(browser=None, search_word=""):
...
@@ -135,23 +138,10 @@ def reptile(browser=None, search_word=""):
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
# 发送爬取数据到java服务
# 发送爬取数据到java服务
def
upload_control
():
if
len
(
data
)
>
0
:
# 定义表单数据
form_data
=
{
"tableName"
:
"pms_ptt"
,
}
# file = io.BytesIO(json.dumps(data).encode())
# response = importJson(file, form_data)
# if response['status_code'] == 200 and response['data']['code'] == 200:
# log.debug("调用成功")
# else:
# log.debug("调用失败")
# upload_control()
# upload_control()
# 保存json文件到本地
# 保存json文件到本地
log
.
debug
(
os
.
path
.
abspath
(
"../"
))
log
.
debug
(
os
.
path
.
abspath
(
"../"
))
file_dir
=
f
'{os.path.join(os.path.abspath("../"),"network-assets-reptile","reptile_data","ptt",str(int(time.time()))+
".json")}'
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "ptt", str(int(time.time())) +
".json")}'
# file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
# file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
state_save
=
save_json
(
file_dir
,
data
)
state_save
=
save_json
(
file_dir
,
data
)
log
.
debug
(
"-----------------------------"
)
log
.
debug
(
"-----------------------------"
)
...
@@ -167,6 +157,14 @@ def reptile(browser=None, search_word=""):
...
@@ -167,6 +157,14 @@ def reptile(browser=None, search_word=""):
response
=
importJsonPath
(
form_data
)
response
=
importJsonPath
(
form_data
)
else
:
else
:
log
.
debug
(
'save file failed'
)
log
.
debug
(
'save file failed'
)
else
:
# 爬取数据为空
form_data
=
{
"path"
:
""
,
"tableName"
:
table_name
}
response
=
importJsonPath
(
form_data
)
# 关闭浏览器驱动
# 关闭浏览器驱动
# time.sleep(3)
# time.sleep(3)
browser
.
quit
()
browser
.
quit
()
...
...
user_data/.gitkeep
0 → 100644
View file @
992c7fbb
utils/createBrowserDriver.py
View file @
992c7fbb
import
os.path
import
sys
import
sys
from
selenium
import
webdriver
from
selenium
import
webdriver
from
selenium.webdriver.chrome.options
import
Options
from
selenium.webdriver.chrome.options
import
Options
...
@@ -16,6 +17,14 @@ def create(option=None):
...
@@ -16,6 +17,14 @@ def create(option=None):
if
option
is
not
None
:
if
option
is
not
None
:
for
value
in
option
:
for
value
in
option
:
chrome_options
.
add_argument
(
value
)
chrome_options
.
add_argument
(
value
)
# 启用浏览器的持久性会话,可以保存登录状态和Cookie
user_data_dir
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'user_data'
)
script
=
f
'--user-data-dir={user_data_dir}'
print
(
script
)
log
.
debug
(
script
)
chrome_options
.
add_argument
(
script
)
# 设置一个自定义的用户配置文件路径
if
sys
.
platform
.
startswith
(
'linux'
):
if
sys
.
platform
.
startswith
(
'linux'
):
# print("当前系统是 Linux")
# print("当前系统是 Linux")
# linux下运行记得加上这些参数 ----------------------------
# linux下运行记得加上这些参数 ----------------------------
...
@@ -36,4 +45,5 @@ def create(option=None):
...
@@ -36,4 +45,5 @@ def create(option=None):
chrome_options
.
add_argument
(
'--no-sandbox'
)
# 禁用沙盒模式
chrome_options
.
add_argument
(
'--no-sandbox'
)
# 禁用沙盒模式
# 创建浏览器驱动对象
# 创建浏览器驱动对象
browser
=
webdriver
.
Chrome
(
options
=
chrome_options
)
browser
=
webdriver
.
Chrome
(
options
=
chrome_options
)
return
browser
return
browser
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment