Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
e0c2ddfc
Commit
e0c2ddfc
authored
Jul 14, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:爬虫优化
parent
ea343def
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
15 additions
and
6 deletions
+15
-6
pc_twitter.py
pc_twitter.py
+8
-5
index.py
utils/index.py
+7
-1
No files found.
pc_twitter.py
View file @
e0c2ddfc
...
@@ -5,7 +5,7 @@ from utils.Logger import log
...
@@ -5,7 +5,7 @@ from utils.Logger import log
from
utils.createBrowserDriver
import
create
from
utils.createBrowserDriver
import
create
from
utils.filse
import
save_json
from
utils.filse
import
save_json
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
utils.index
import
convert_to_traditional
,
yt_dlp_download
,
convert_string_to_time
from
utils.index
import
convert_to_traditional
,
yt_dlp_download
,
convert_string_to_time
,
parse_twitter_time_string
# from pytube import YouTube
# from pytube import YouTube
import
os
import
os
from
config.settings
import
get_base_file_url
from
config.settings
import
get_base_file_url
...
@@ -20,7 +20,7 @@ def reptile(browser=None, search_word=""):
...
@@ -20,7 +20,7 @@ def reptile(browser=None, search_word=""):
url
=
"https://twitter.com/"
url
=
"https://twitter.com/"
option
=
[
'--headless'
]
option
=
[
'--headless'
]
# ['--headless']
# ['--headless']
browser
=
browser
or
create
(
option
,
False
)
browser
=
browser
or
create
(
None
,
False
)
# print(browser)
# print(browser)
# 打开网页
# 打开网页
browser
.
get
(
url
)
browser
.
get
(
url
)
...
@@ -41,7 +41,7 @@ def reptile(browser=None, search_word=""):
...
@@ -41,7 +41,7 @@ def reptile(browser=None, search_word=""):
except
:
except
:
print
(
"------"
)
print
(
"------"
)
time
.
sleep
(
2
)
time
.
sleep
(
2
)
url
=
'https://twitter.com/search?q='
+
search_word
+
'&src=typed_query'
url
=
'https://twitter.com/search?q='
+
search_word
+
'&src=typed_query'
browser
.
get
(
url
)
browser
.
get
(
url
)
time
.
sleep
(
3
)
time
.
sleep
(
3
)
# 内容块
# 内容块
...
@@ -57,7 +57,10 @@ def reptile(browser=None, search_word=""):
...
@@ -57,7 +57,10 @@ def reptile(browser=None, search_word=""):
length
=
len
(
element_authors_list
)
length
=
len
(
element_authors_list
)
for
index
in
range
(
length
):
for
index
in
range
(
length
):
author
=
element_authors_list
[
index
]
.
text
author
=
element_authors_list
[
index
]
.
text
try
:
release_time
=
str
(
int
(
parse_twitter_time_string
(
element_release_list
[
index
]
.
text
)))
release_time
=
str
(
int
(
parse_twitter_time_string
(
element_release_list
[
index
]
.
text
)))
except
:
release_time
=
str
(
int
(
time
.
time
()))
content
=
element_content_list
[
index
]
.
get_attribute
(
"innerHTML"
)
content
=
element_content_list
[
index
]
.
get_attribute
(
"innerHTML"
)
# print(content)
# print(content)
# 内容过滤
# 内容过滤
...
@@ -141,7 +144,7 @@ def main():
...
@@ -141,7 +144,7 @@ def main():
table_name
=
item
[
'tableName'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
status_task
=
int
(
item
[
"status"
])
# 简体转繁体
# 简体转繁体
if
status_task
==
0
and
len
(
search_word
)
>
0
:
if
status_task
==
0
and
len
(
search_word
)
>
0
:
reptile
(
None
,
convert_to_traditional
(
search_word
))
reptile
(
None
,
convert_to_traditional
(
search_word
))
else
:
else
:
log
.
debug
(
"爬取任务未启用"
)
log
.
debug
(
"爬取任务未启用"
)
...
...
utils/index.py
View file @
e0c2ddfc
import
os.path
import
os.path
import
re
import
re
import
time
import
opencc
import
opencc
from
pytube
import
YouTube
from
pytube
import
YouTube
import
ssl
import
ssl
...
@@ -80,8 +82,12 @@ def parse_twitter_time_string(time_str):
...
@@ -80,8 +82,12 @@ def parse_twitter_time_string(time_str):
:param time_str:
:param time_str:
:return:
:return:
"""
"""
times
=
parser
.
parse
(
time_str
,
fuzzy
=
True
)
# a = datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S")
b
=
datetime
.
datetime
.
strftime
(
times
,
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
c
=
time
.
mktime
(
time
.
strptime
(
b
,
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
))
# 解析相对时间字符串
# 解析相对时间字符串
return
datetime
.
timestamp
(
parser
.
parse
(
time_str
,
fuzzy
=
True
))
return
c
def
convert_to_traditional
(
simplified_text
):
def
convert_to_traditional
(
simplified_text
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment