Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
00b24977
Commit
00b24977
authored
Jul 13, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:去除富文本转义符号
parent
22acb3c5
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
58 additions
and
34 deletions
+58
-34
pc_twitter.py
pc_twitter.py
+56
-32
pc_youtube.py
pc_youtube.py
+2
-2
No files found.
pc_twitter.py
View file @
00b24977
# # 导入依赖库
import
json
import
time
from
telnetlib
import
EC
from
bs4
import
BeautifulSoup
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.wait
import
WebDriverWait
from
utils.Logger
import
log
from
utils.createBrowserDriver
import
create
from
utils.filse
import
save_json
from
api.index
import
importJson
,
getReptileTask
,
importJsonPath
from
utils.index
import
convert_to_traditional
,
yt_dlp_download
,
convert_string_to_time
# from pytube import YouTube
import
os
from
config.settings
import
get_base_file_url
# 工具函数-下载图片
'''
...
...
@@ -14,36 +16,30 @@ from utils.createBrowserDriver import create
'''
def
reptile
(
browser
):
# # json 数据
data
=
[]
image_key
=
0
fileDir
=
"./reptile_data/news/nytimes/"
# year = datetime(2021, 1, 1)
# startDate = datetime(2020, 12, 31) # 初始日期
# endDate = datetime(2020, 12, 31) # 结束日期
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://twitter.com/"
browser
=
browser
or
create
(
False
)
print
(
browser
)
option
=
[
'--headless'
]
# ['--headless']
browser
=
browser
or
create
()
# print(browser)
# browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
# endDate = startDate = startDate + timedelta(days=i)
# 打开网页
browser
.
get
(
url
)
# WebDriverWait(browser,10).
# 打开登录窗口
open_button_login
=
WebDriverWait
(
browser
,
10
)
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//a[@data-testid='login']"
)))
open_button_login
.
click
()
time
.
sleep
(
5
)
#
open_button_login = WebDriverWait(browser, 10).until(
#
EC.presence_of_element_located((By.XPATH, "//a[@data-testid='login']")))
#
open_button_login.click()
#
time.sleep(5)
# 获取账号密码输入框
input_email_element
=
WebDriverWait
(
browser
,
10
)
.
until
(
EC
.
presence_of_element_located
((
By
.
XPATH
,
"//input[@autocomplete='username']"
)))
# 获取下一步按钮
buttons
=
WebDriverWait
(
browser
,
10
)
.
until
(
EC
.
presence_of_all_elements_located
((
By
.
XPATH
,
"//div[@role='button']"
)))
for
item
in
buttons
:
print
(
BeautifulSoup
(
item
,
'html.parser'
))
#
input_email_element = WebDriverWait(browser, 10).until(
#
EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']")))
#
#
获取下一步按钮
#
buttons = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@role='button']")))
#
for item in buttons:
#
print(BeautifulSoup(item, 'html.parser'))
# soup = BeautifulSoup(page_content, 'html.parser')
# input_pwd_element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "//input[@name='pass']")))
# # 获取登录按钮
...
...
@@ -68,8 +64,8 @@ def reptile(browser):
# break
# time.sleep(3)
# 获取完整的分页数据
page_content
=
browser
.
page_source
soup
=
BeautifulSoup
(
page_content
,
'html.parser'
)
#
page_content = browser.page_source
#
soup = BeautifulSoup(page_content, 'html.parser')
# print("----------")
# print(soup)
# list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
...
...
@@ -105,6 +101,34 @@ def reptile(browser):
# with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
# json.dump(data, file, indent=2, ensure_ascii=False)
browser
.
close
()
# 关闭浏览器驱动
browser
.
quit
()
# browser.close()
# # 关闭浏览器驱动
# browser.quit()
def main():
    """Fetch the crawl task list and run the Twitter crawler once.

    Calls ``getReptileTask()``. When both the HTTP status code and the
    payload ``code`` are 200, the keyword and table name are taken from the
    row whose ``name`` is ``'twitter'`` and ``reptile`` is run with the
    keyword passed through ``convert_to_traditional``. Otherwise the failure
    is logged and ``reptile`` is run with an empty search word.
    """
    # Without this declaration the assignment below only creates a local
    # variable and the module-level ``table_name`` is never updated.
    global table_name

    # Request the search keywords
    response = getReptileTask()
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'twitter':
                search_word = item['keyword']
                table_name = item['tableName']
        # NOTE(review): the original indentation was lost; this assumes the
        # crawler runs once after the loop with the last matching keyword.
        reptile(None, convert_to_traditional(search_word))
    else:
        log.debug("call failed")
        reptile(None, '')
# upload_control()
# Module-level globals
# `data` starts as an empty list; `table_name` defaults to "pms_twitter".
data = []
table_name = "pms_twitter"
# Call main() -- this runs whenever the module is executed or imported.
main()
\ No newline at end of file
pc_youtube.py
View file @
00b24977
...
...
@@ -29,7 +29,7 @@ def reptile(browser=None, search_word=""):
# print(classify_item_list)
length
=
len
(
classify_video_list
)
for
index
in
range
(
length
):
if
0
<
index
<
2
:
if
-
1
<
index
<
length
:
title
=
classify_video_list
[
index
]
.
get_attribute
(
'title'
)
link
=
classify_video_list
[
index
]
.
get_attribute
(
'href'
)
# yt = YouTube(link)
...
...
@@ -47,7 +47,7 @@ def reptile(browser=None, search_word=""):
# 组装数据
obj
=
{
"title"
:
title
,
"content"
:
f
"<video src='{file_http_src}'></video>"
,
"content"
:
f
"<video
controls style='width:100
%
'
src='{file_http_src}'></video>"
,
"videoUrl"
:
file_http_src
,
"link"
:
link
,
"reptileTime"
:
str
(
int
(
time
.
time
())),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment