Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
653b114a
Commit
653b114a
authored
Jul 31, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:youtube debug
parent
c4a794b8
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
148 additions
and
66 deletions
+148
-66
pc_dcard.py
pc_dcard.py
+17
-8
pc_facebook.py
pc_facebook.py
+17
-8
pc_instagram.py
pc_instagram.py
+17
-8
pc_ptt.py
pc_ptt.py
+17
-9
pc_twitter.py
pc_twitter.py
+17
-8
pc_youtube.py
pc_youtube.py
+17
-8
test.py
test.py
+46
-17
No files found.
pc_dcard.py
View file @
653b114a
...
...
@@ -60,7 +60,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime
=
int
(
timestamp
)
if
new_releaseTime
<
filter_time_start
or
new_releaseTime
>
filter_time_end
:
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
...
...
@@ -180,7 +180,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime"
:
data
[
0
][
"reptileTime"
],
# 本地路径
"localPath"
:
local_path
"localPath"
:
local_path
,
"beginFiltrationTime"
:
beginFiltrationTime
,
"endFiltrationTime"
:
endFiltrationTime
,
"keyword"
:
keyword
}
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
"task.json"
),
task
)
if
state_save
:
...
...
@@ -221,8 +224,9 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
global
filter_time_start
global
filter_time_end
global
beginFiltrationTime
global
endFiltrationTime
global
keyword
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
search_word
=
""
...
...
@@ -231,8 +235,9 @@ def main():
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
filter_time_start
=
int
(
item
[
"beginFiltrationTime"
])
filter_time_end
=
int
(
item
[
"endFiltrationTime"
])
keyword
=
str
(
item
[
"keyword"
])
beginFiltrationTime
=
int
(
item
[
"beginFiltrationTime"
])
endFiltrationTime
=
int
(
item
[
"endFiltrationTime"
])
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
...
...
@@ -250,10 +255,14 @@ data = []
# 任务详情
task
=
{}
table_name
=
"pms_dcard"
# 全局字段
keyword
=
""
# 过滤时间开始
filter_time_start
=
int
(
123
)
beginFiltrationTime
=
int
(
123
)
# 过滤时间结束
filter_time_end
=
int
(
123
)
endFiltrationTime
=
int
(
123
)
# 文件根目录
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
...
...
pc_facebook.py
View file @
653b114a
...
...
@@ -88,7 +88,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime
=
int
(
release_time
)
if
new_releaseTime
<
filter_time_start
or
new_releaseTime
>
filter_time_end
:
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
...
...
@@ -179,7 +179,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime"
:
data
[
0
][
"reptileTime"
],
# 本地路径
"localPath"
:
local_path
"localPath"
:
local_path
,
"beginFiltrationTime"
:
beginFiltrationTime
,
"endFiltrationTime"
:
endFiltrationTime
,
"keyword"
:
keyword
}
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
"task.json"
),
task
)
if
state_save
:
...
...
@@ -220,8 +223,9 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
global
filter_time_start
global
filter_time_end
global
beginFiltrationTime
global
endFiltrationTime
global
keyword
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
search_word
=
""
...
...
@@ -230,8 +234,9 @@ def main():
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
filter_time_start
=
int
(
item
[
"beginFiltrationTime"
])
filter_time_end
=
int
(
item
[
"endFiltrationTime"
])
keyword
=
str
(
item
[
"keyword"
])
beginFiltrationTime
=
int
(
item
[
"beginFiltrationTime"
])
endFiltrationTime
=
int
(
item
[
"endFiltrationTime"
])
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
...
...
@@ -249,10 +254,14 @@ data = []
# 任务详情
task
=
{}
table_name
=
"pms_facebook"
# 全局字段
keyword
=
""
# 过滤时间开始
filter_time_start
=
int
(
123
)
beginFiltrationTime
=
int
(
123
)
# 过滤时间结束
filter_time_end
=
int
(
123
)
endFiltrationTime
=
int
(
123
)
# 文件根目录
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
...
...
pc_instagram.py
View file @
653b114a
...
...
@@ -87,7 +87,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime
=
int
(
timestamp
)
if
new_releaseTime
<
filter_time_start
or
new_releaseTime
>
filter_time_end
:
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
...
...
@@ -170,7 +170,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime"
:
data
[
0
][
"reptileTime"
],
# 本地路径
"localPath"
:
local_path
"localPath"
:
local_path
,
"beginFiltrationTime"
:
beginFiltrationTime
,
"endFiltrationTime"
:
endFiltrationTime
,
"keyword"
:
keyword
}
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
"task.json"
),
task
)
if
state_save
:
...
...
@@ -210,8 +213,9 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
global
filter_time_start
global
filter_time_end
global
beginFiltrationTime
global
endFiltrationTime
global
keyword
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
search_word
=
""
...
...
@@ -220,8 +224,9 @@ def main():
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
filter_time_start
=
int
(
item
[
"beginFiltrationTime"
])
filter_time_end
=
int
(
item
[
"endFiltrationTime"
])
keyword
=
str
(
item
[
"keyword"
])
beginFiltrationTime
=
int
(
item
[
"beginFiltrationTime"
])
endFiltrationTime
=
int
(
item
[
"endFiltrationTime"
])
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
...
...
@@ -239,10 +244,14 @@ data = []
# 任务详情
task
=
{}
table_name
=
"pms_instagram"
# 全局字段
keyword
=
""
# 过滤时间开始
filter_time_start
=
int
(
123
)
beginFiltrationTime
=
int
(
123
)
# 过滤时间结束
filter_time_end
=
int
(
123
)
endFiltrationTime
=
int
(
123
)
# 文件根目录
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
...
...
pc_ptt.py
View file @
653b114a
...
...
@@ -190,7 +190,7 @@ def reptile(browser=None, search_word=""):
release_time
=
int
(
date_time
.
timestamp
())
# 过滤时间
if
filter_time_start
<=
release_time
<=
filter_time_end
:
if
beginFiltrationTime
<=
release_time
<=
endFiltrationTime
:
# --------------- 组装数据 start---------------------
obj
=
{
"title"
:
element_title
.
text
,
...
...
@@ -230,7 +230,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime"
:
data
[
0
][
"reptileTime"
],
# 本地路径
"localPath"
:
local_path
"localPath"
:
local_path
,
"beginFiltrationTime"
:
beginFiltrationTime
,
"endFiltrationTime"
:
endFiltrationTime
,
"keyword"
:
keyword
}
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
"task.json"
),
task
)
if
state_save
:
...
...
@@ -263,7 +266,6 @@ def script_close(browser):
print
(
"sys.exit() 执行失败"
)
def
main
():
"""
...
...
@@ -271,8 +273,9 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
global
filter_time_start
global
filter_time_end
global
beginFiltrationTime
global
endFiltrationTime
global
keyword
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
search_word
=
""
...
...
@@ -281,8 +284,9 @@ def main():
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
filter_time_start
=
int
(
item
[
"beginFiltrationTime"
])
filter_time_end
=
int
(
item
[
"endFiltrationTime"
])
keyword
=
str
(
item
[
"keyword"
])
beginFiltrationTime
=
int
(
item
[
"beginFiltrationTime"
])
endFiltrationTime
=
int
(
item
[
"endFiltrationTime"
])
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
...
...
@@ -300,10 +304,14 @@ data = []
# 任务详情
task
=
{}
table_name
=
"pms_ptt"
# 全局字段
keyword
=
""
# 过滤时间开始
filter_time_start
=
int
(
123
)
beginFiltrationTime
=
int
(
123
)
# 过滤时间结束
filter_time_end
=
int
(
123
)
endFiltrationTime
=
int
(
123
)
# 文件根目录
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
...
...
pc_twitter.py
View file @
653b114a
...
...
@@ -95,7 +95,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime
=
int
(
timestamp
)
if
new_releaseTime
<
filter_time_start
or
new_releaseTime
>
filter_time_end
:
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
...
...
@@ -200,7 +200,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime"
:
data
[
0
][
"reptileTime"
],
# 本地路径
"localPath"
:
local_path
"localPath"
:
local_path
,
"beginFiltrationTime"
:
beginFiltrationTime
,
"endFiltrationTime"
:
endFiltrationTime
,
"keyword"
:
keyword
}
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
"task.json"
),
task
)
if
state_save
:
...
...
@@ -241,8 +244,9 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
global
filter_time_start
global
filter_time_end
global
beginFiltrationTime
global
endFiltrationTime
global
keyword
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
search_word
=
""
...
...
@@ -250,9 +254,10 @@ def main():
if
item
[
'name'
]
==
'twitter'
:
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
keyword
=
str
(
item
[
"keyword"
])
status_task
=
int
(
item
[
"status"
])
filter_time_start
=
int
(
item
[
"beginFiltrationTime"
])
filter_time_end
=
int
(
item
[
"endFiltrationTime"
])
beginFiltrationTime
=
int
(
item
[
"beginFiltrationTime"
])
endFiltrationTime
=
int
(
item
[
"endFiltrationTime"
])
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
...
...
@@ -270,10 +275,14 @@ data = []
# 任务详情
task
=
{}
table_name
=
"pms_twitter"
# 全局字段
keyword
=
""
# 过滤时间开始
filter_time_start
=
int
(
123
)
beginFiltrationTime
=
int
(
123
)
# 过滤时间结束
filter_time_end
=
int
(
123
)
endFiltrationTime
=
int
(
123
)
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name
=
str
(
int
(
time
.
time
()))
...
...
pc_youtube.py
View file @
653b114a
...
...
@@ -73,7 +73,7 @@ def reptile(browser=None, search_word=""):
# 过滤时间
# # 如果'releaseTime'不是整数,则将其转换为整数
new_releaseTime
=
int
(
releaseTime
)
if
new_releaseTime
<
filter_time_start
or
new_releaseTime
>
filter_time_end
:
if
new_releaseTime
<
beginFiltrationTime
or
new_releaseTime
>
endFiltrationTime
:
# 如果'new_releaseTime'不在指定范围内,则跳过当前迭代,继续下一个项目
continue
...
...
@@ -111,7 +111,10 @@ def reptile(browser=None, search_word=""):
# 爬取时间
"reptileTime"
:
data
[
0
][
"reptileTime"
],
# 本地路径
"localPath"
:
local_path
"localPath"
:
local_path
,
"beginFiltrationTime"
:
beginFiltrationTime
,
"endFiltrationTime"
:
endFiltrationTime
,
"keyword"
:
keyword
}
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
"task.json"
),
task
)
if
state_save
:
...
...
@@ -151,8 +154,9 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
global
filter_time_start
global
filter_time_end
global
beginFiltrationTime
global
endFiltrationTime
global
keyword
# print(response)
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
...
...
@@ -162,8 +166,9 @@ def main():
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
filter_time_start
=
int
(
item
[
"beginFiltrationTime"
])
filter_time_end
=
int
(
item
[
"endFiltrationTime"
])
keyword
=
str
(
item
[
"keyword"
])
beginFiltrationTime
=
int
(
item
[
"beginFiltrationTime"
])
endFiltrationTime
=
int
(
item
[
"endFiltrationTime"
])
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
...
...
@@ -181,10 +186,14 @@ data = []
# 任务详情
task
=
{}
table_name
=
"pms_youtube"
# 全局字段
keyword
=
""
# 过滤时间开始
filter_time_start
=
int
(
123
)
beginFiltrationTime
=
int
(
123
)
# 过滤时间结束
filter_time_end
=
int
(
123
)
endFiltrationTime
=
int
(
123
)
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", table_name.split("_")[1])}'
# 任务目录名称
local_path_name
=
str
(
int
(
time
.
time
()))
...
...
test.py
View file @
653b114a
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
# log.debug(classify_item_list)
length
=
len
(
classify_item_list
)
for
index
in
range
(
length
):
# 暂时先爬取 第2个 分类
if
0
<=
index
<
4
:
type_title
=
classify_item_list
[
index
]
.
text
classify_item_list
[
index
]
.
click
()
time
.
sleep
(
0.1
)
for
index_two
in
range
(
length_two
):
print
(
element_list
[
index_two
]
.
text
)
# 浏览器返回上一页
browser
.
back
()
if
index
==
0
:
browser
.
back
()
time
.
sleep
(
0.1
)
classify_item_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@class='board-class']"
)
\ No newline at end of file
import
io
import
json
import
re
import
sys
import
time
import
loguru
# import pymysql.cursors
import
requests
from
bs4
import
BeautifulSoup
from
datetime
import
datetime
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
utils.Logger
import
log
from
utils.index
import
convert_to_traditional
,
create_directory_if_not_exists
,
delete_directory
# from requests_toolbelt import *
from
utils.createBrowserDriver
import
create
import
opencc
from
utils.filse
import
save_json
import
os
from
config.settings
import
get_base_file_url
from
utils.download_image
import
download_image
# --------------- selenium 依赖 start ----------------
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
# --------------- selenium 依赖 end ----------------
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
爬取流程:创建驱动--》打开浏览器--》打开网页--》爬取分类元素--》循环点击--》爬取热门帖子标题--》循环点击--》爬取帖子详情
'''
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://skynet.ipplus360.com/q.html"
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
True
)
# 有头模式执行
# browser = browser or create()
# 打开网页
browser
.
get
(
url
)
print
(
"------"
)
print
(
browser
.
page_source
)
# log.debug("已打开浏览器")
# classify_item_list = browser.find_elements('xpath', "//div[@class='board-class']")
reptile
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment