liyang / network-assets-reptile · Commits

Commit af6aac0d authored Aug 04, 2023 by liyang

feat: add "total" field to task.json

parent 72c7ada4
Showing 8 changed files with 87 additions and 53 deletions.
pc_dcard.py      +2  -1
pc_facebook.py   +2  -1
pc_instagram.py  +2  -1
pc_ltn.py        +2  -1
pc_ptt.py        +3  -2
pc_twitter.py    +2  -1
pc_youtube.py    +2  -1
test.py          +72 -45
pc_dcard.py
@@ -183,7 +183,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
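Seven of the eight files receive this identical two-line change: the task metadata dict gains a "total" field counting the scraped items before the dict is persisted as task.json. A minimal, self-contained sketch of the resulting write follows; the field values and the save_json helper are stand-ins inferred from the diff context, and the project's real utils.filse.save_json may behave differently.

import json
import os

# a minimal stand-in for utils.filse.save_json (assumed behavior:
# write pretty-printed UTF-8 JSON and report success)
def save_json(path, payload):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)
    return True

# hypothetical values standing in for the crawler's real state
data = [{"title": "post 1"}, {"title": "post 2"}]
file_dir = "./output"
local_path = file_dir
beginFiltrationTime = ""
endFiltrationTime = ""
keyword = ""

os.makedirs(file_dir, exist_ok=True)

task = {
    "localPath": local_path,
    "beginFiltrationTime": beginFiltrationTime,
    "endFiltrationTime": endFiltrationTime,
    "keyword": keyword,
    "total": len(data),  # the new field: number of scraped items
}
state_save = save_json(os.path.join(file_dir, "task.json"), task)
if state_save:
    print("task.json written, total =", task["total"])

With the count stored alongside the other task metadata, consumers of task.json no longer have to load the full data file just to learn how many items a crawl produced.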
pc_facebook.py
@@ -186,7 +186,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
pc_instagram.py
@@ -173,7 +173,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
pc_ltn.py
@@ -156,7 +156,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
pc_ptt.py
@@ -43,7 +43,7 @@ def reptile(browser=None, search_word=""):
     for index, item_element in enumerate(classify_item_list):
         # for now, only crawl the 2nd category
-        if 0 <= index:
+        if 0 <= index <= 14:
             type_title = classify_item_list[index].text
             # enter the category page
             classify_item_list[index].click()
@@ -233,7 +233,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
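The first pc_ptt.py hunk also tightens the category guard from the open-ended `0 <= index` to `0 <= index <= 14`, capping the crawl at the first 15 board categories. An equivalent and arguably clearer form slices the list before iterating; this is a sketch with a stand-in list, not code from the commit.

# stand-in for the WebElement list returned by browser.find_elements(...)
classify_item_list = [f"category-{i}" for i in range(30)]

MAX_CATEGORIES = 15  # indexes 0..14, matching the new bound in the diff

# slicing makes the crawl limit explicit and drops the index check entirely
for index, item_element in enumerate(classify_item_list[:MAX_CATEGORIES]):
    print(index, item_element)  # the real code reads .text and clicks the element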
pc_twitter.py
@@ -214,7 +214,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
pc_youtube.py
@@ -114,7 +114,8 @@ def reptile(browser=None, search_word=""):
         "localPath": local_path,
         "beginFiltrationTime": beginFiltrationTime,
         "endFiltrationTime": endFiltrationTime,
-        "keyword": keyword
+        "keyword": keyword,
+        "total": len(data)
     }
     state_save = save_json(os.path.join(file_dir, "task.json"), task)
     if state_save:
test.py (rewritten: the selenium crawler scaffold is removed and replaced by a MySQL export script for the as_info table)
-import io
+import mysql.connector
 import json
-import re
-import sys
-import time
-import loguru
-# import pymysql.cursors
-import requests
-from bs4 import BeautifulSoup
-from datetime import datetime
-from api.index import importJson, getReptileTask, importJsonPath
-from utils.Logger import log
-from utils.index import convert_to_traditional, create_directory_if_not_exists, delete_directory
-# from requests_toolbelt import *
-from utils.createBrowserDriver import create
-import opencc
-from utils.filse import save_json
-import os
-from config.settings import get_base_file_url
-from utils.download_image import download_image
-# --------------- selenium dependencies start ----------------
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-# --------------- selenium dependencies end ----------------
-'''
-Crawl the trending posts on Taiwan's PTT forum, including each post's title and content (text, images, video).
-Flow: create driver -> open browser -> open page -> scrape category elements -> click each -> scrape trending post titles -> click each -> scrape post details
-'''
-def reptile(browser=None, search_word=""):
-    url = "https://skynet.ipplus360.com/q.html"
-    browser = browser or create(no_headless=False, using_user_data=True)
-    # run in headed mode
-    # browser = browser or create()
-    # open the page
-    browser.get(url)
-    print("------")
-    print(browser.page_source)
-    # log.debug("已打开浏览器")
-    # classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
-reptile()
\ No newline at end of file
+# Connect to the database
+db_config = {
+    "host": "8.142.151.250",
+    "user": "script",
+    "password": "changfA123$",
+    "database": "network_assets"
+}
+connection = mysql.connector.connect(**db_config)
+cursor = connection.cursor()
+# Set group_concat_max_len so returned strings can be up to 1,000,000 characters
+cursor.execute("SET SESSION group_concat_max_len = 1000000")
+# Execute the SQL query
+sql_query = """
+SELECT
+    country_code AS countryCode,
+    CONCAT(
+        '{"countryCode": "', REPLACE(country_code, '"', '\\"'), '", ',
+        '"ASInfoList": [',
+        GROUP_CONCAT(
+            CONCAT(
+                '{"topology": false, "ASType": "', REPLACE(type, '"', '\\"'),
+                '", "linkedNumber": ', connect_degree,
+                ', "ASNumber": ', as_number,
+                ', "ASDegrees": ', transmit_degree,
+                ', "countryCode": "', REPLACE(country_code, '"', '\\"'), '"}'
+            )
+            SEPARATOR ', '
+        ),
+        '], ',
+        '"countryName": "', REPLACE(country, '"', '\\"'), '"}'
+    ) AS result
+FROM as_info
+GROUP BY country_code, country
+"""
+cursor.execute(sql_query)
+query_result = cursor.fetchall()
+# Close the database connection
+cursor.close()
+connection.close()
+# Convert the query result into properly formatted data
+formatted_result = []
+for row in query_result:
+    country_code = row[0]
+    result_data = row[1]
+    # Convert non-string types to strings
+    if isinstance(result_data, (list, tuple)):
+        result_data = [str(item) for item in result_data]
+    # # Build the JSON data
+    # json_data = {
+    #     "countryCode": country_code,
+    #     "ASInfoList": result_data
+    # }
+    data = json.loads(result_data)
+    formatted_result.append(data)
+# Export the result to a JSON file
+output_file_path = "./output.json"
+with open(output_file_path, "w", encoding="utf-8") as json_file:
+    json.dump(formatted_result, json_file, indent=4, ensure_ascii=False)
+print(f"查询结果已导出到 {output_file_path}")