Commit 45591aa3, authored Jul 20, 2023 by liyang
Parent: 1b3d2164

fix: selenium driver configuration

Showing 6 changed files with 93 additions and 18 deletions (+93 / -18):
pc_facebook.py                +1   -0
pc_ptt.py                     +1   -1
pc_twitter.py                 +51  -9
test.py                       +16  -0
utils/createBrowserDriver.py  +22  -6
utils/index.py                +2   -2
pc_facebook.py

@@ -81,6 +81,7 @@ def reptile(browser=None, search_word=""):
     image_list = soup.find_all("img")
     # lth = len(ignore_list)
     if len(video_list) > 0:
+        # for key,element in enumerate(video_list):
         # Delete the second child element
         # Find the parent element that holds the two <div> children
         parent_div = soup.find('div')
pc_ptt.py

@@ -27,7 +27,7 @@ import os
 def reptile(browser=None, search_word=""):
     url = "https://www.ptt.cc/bbs/hotboards.html"
     # Run in headless mode
-    browser = browser or create(['--headless'])
+    browser = browser or create(['--headless'], False)
     # Run in headed (windowed) mode
     # browser = browser or create()
     # Open the page
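The second argument introduced here is `using_user_data` (see `create(option=None, using_user_data=True)` in utils/createBrowserDriver.py below); passing `False` makes the helper skip the persistent `--user-data-dir` profile. A minimal usage sketch, assuming the script runs from the repo root so the helper's relative user-data path resolves; the try/finally wrapper is illustrative, not part of this diff:

    from utils.createBrowserDriver import create

    # Headless Chrome with a throwaway profile (no cookies persisted),
    # matching the call pc_ptt.py now makes.
    browser = create(['--headless'], False)
    try:
        browser.get("https://www.ptt.cc/bbs/hotboards.html")
        print(browser.title)
    finally:
        browser.quit()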
pc_twitter.py

@@ -9,6 +9,7 @@ from utils.index import convert_to_traditional, yt_dlp_download, convert_string_
 # from pytube import YouTube
 import os
 from datetime import datetime
+from utils.download_image import download_image
 from config.settings import get_base_file_url
 # Utility function: download images
@@ -21,7 +22,7 @@ def reptile(browser=None, search_word=""):
     base_url = "https://twitter.com/"
     option = ['--headless']
     # ['--headless']
-    browser = browser or create(None, False)
+    browser = browser or create(None, True)
     # print(browser)
     # Open the page
     browser.get(base_url)
@@ -36,12 +37,13 @@ def reptile(browser=None, search_word=""):
         time.sleep(3)
         password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
         password_input.send_keys("liyang19970814")
         # Grab the login button
         button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
         button_login.click()
     except:
         print("------")
     time.sleep(2)
+    # print("1111")
     url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
     browser.get(url)
     time.sleep(4)
@@ -52,16 +54,10 @@ def reptile(browser=None, search_word=""):
     element_content_list = browser.find_elements('xpath', base_xpath)
     # Authors
     element_authors_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
-    # time.sleep(2)
-    # Publish time
-    # element_release_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time")
-    # time_a_list = browser.find_elements('xpath',f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time/..")
-    # print(element_content_list)
     length = len(element_authors_list)
     for index in range(length):
         # print(index)
-        soup = BeautifulSoup(element_content_list[index].get_attribute("outerHTML"), "html.parser")
+        content = element_content_list[index].get_attribute("outerHTML")
+        soup = BeautifulSoup(content, "html.parser")
         # Find the <time> tag
         time_soup = soup.find('time')
         timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
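The timestamp line converts Twitter's ISO-8601 `datetime` attribute, which ends in `Z`, to a Unix epoch; `datetime.fromisoformat` does not accept the `Z` suffix before Python 3.11, hence the `replace`. A standalone sketch of the same conversion (the sample string is illustrative):

    from datetime import datetime

    iso = "2023-07-20T08:15:30.000Z"  # shape of <time datetime="..."> on a tweet
    ts = datetime.fromisoformat(iso.replace("Z", "+00:00")).timestamp()
    print(int(ts))                          # Unix epoch seconds
    print(datetime.fromtimestamp(int(ts)))  # local datetime, as used in the title below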
@@ -70,6 +66,52 @@ def reptile(browser=None, search_word=""):
         author = element_authors_list[index].text
         # Title: author + date
         title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
+        video_list = soup.find_all("video")
+        image_list = soup.find_all("img")
+        # lth = len(ignore_list)
+        if len(video_list) > 0:
+            # for key,element in enumerate(video_list):
+            # Delete the second child element
+            # Find the parent element that holds the two <div> children
+            parent_div = soup.find('div')
+            # Find all the direct <div> children
+            div_elements = parent_div.find_all('div', recursive=False)
+            # div_tags = soup.find_all("div", recursive=False)
+            # Make sure the list has at least two <div> children
+            if len(div_elements) >= 2:
+                # Get the second <div> element and remove it from its parent
+                div_to_remove = div_elements[1]
+                div_to_remove.extract()
+                # Delete
+                # div.decompose()
+                # Create a placeholder <video> tag
+                custom_video = soup.new_tag("video")
+                custom_video["src"] = ""
+                parent_div.append(custom_video)
+            else:
+                print("")
+        picture_url = []
+        if len(image_list) > 0:
+            for key, element in enumerate(image_list):
+                # Download the image locally and replace the src in the tag
+                id = str(int(time.time()))
+                # Download path
+                download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+                # Access URL
+                access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+                # Download status
+                status = download_image(element['src'], download_dir)
+                if status:
+                    element['src'] = access_address
+                    picture_url.append(access_address)
+        else:
+            print("")
+        content = soup.prettify()
         # ---------------- determine type: start ----------
         # Type
         content_type = ""
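The added block rewrites each tweet's HTML before it is persisted: when a <video> is present, the second direct child <div> is detached and replaced with an empty <video> placeholder, and each <img> src is pointed at a locally mirrored copy. A self-contained sketch of the BeautifulSoup part, with illustrative sample HTML:

    from bs4 import BeautifulSoup

    html = '<div><div>tweet text</div><div>player widget</div></div>'
    soup = BeautifulSoup(html, "html.parser")

    parent_div = soup.find('div')
    div_elements = parent_div.find_all('div', recursive=False)
    if len(div_elements) >= 2:
        div_elements[1].extract()            # detach the second child <div>
        placeholder = soup.new_tag("video")  # stand-in for the stripped player
        placeholder["src"] = ""
        parent_div.append(placeholder)

    print(soup.prettify())  # <div> now holds the text <div> plus <video src="">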
test.py (new file, mode 100644)

+# from utils.index import yt_dlp_download
+#
+# status = yt_dlp_download("https://www.facebook.com/e5627ead-8b9a-48fd-820f-ee242cc08bbb", "facebook")
+# print(status)
+import time
+from selenium.webdriver import Firefox
+from selenium import webdriver
+
+driver = webdriver.Firefox()
+driver.get("https://www.toutiao.com/a6969138023774667264/")
+time.sleep(2)
+html = driver.page_source
+print(html)
+driver.quit()
\ No newline at end of file
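This smoke test drives a headed Firefox, which needs a display and geckodriver on PATH. If it ever has to run on a server, a headless variant is a small change; a sketch assuming Selenium 4:

    import time

    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("-headless")  # run Firefox without a window
    driver = webdriver.Firefox(options=options)
    try:
        driver.get("https://www.toutiao.com/a6969138023774667264/")
        time.sleep(2)
        print(driver.page_source[:200])  # print a sample instead of the full page
    finally:
        driver.quit()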
utils/createBrowserDriver.py

-import os.path
+import os
+import platform
 import sys
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options

@@ -6,7 +7,8 @@ from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
+import chromedriver_autoinstaller
+# from mozprofile import FirefoxProfile
 '''
 Create a browser instance
 '''
@@ -19,18 +21,32 @@ def create(option=None, using_user_data=True):
     :param option:
     :return:
     """
+    # Install or upgrade chromedriver
+    chromedriver_autoinstaller.install()
+    # Get the existing Chrome user-data directory
+    # chrome_user_data_dir = ""
+    # if platform.system() == 'Windows':
+    #     chrome_user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
+    #                                         'User Data')
+    # elif platform.system() == 'Linux':
+    #     chrome_user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
+    # elif platform.system() == 'Darwin':
+    #     chrome_user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google', 'Chrome')
+    # else:
+    #     raise Exception('Unsupported operating system')
     chrome_options = webdriver.ChromeOptions()
     if option is not None:
         for value in option:
             chrome_options.add_argument(value)
     # Enable a persistent browser session so login state and cookies survive
+    # Use a local profile
     user_data_dir = os.path.join(os.path.abspath("../"), 'network-assets-reptile', 'user_data')
-    script = f'--user-data-dir={user_data_dir}'
-    # print(script)
-    # log.debug(script)
     if using_user_data:
-        chrome_options.add_argument(script)
+        # Set a custom user profile path
+        # Add the user-data-dir argument
+        chrome_options.add_argument(f'--user-data-dir={user_data_dir}')
     if sys.platform.startswith('linux'):
         # print("The current system is Linux")
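`chromedriver_autoinstaller.install()` checks the local Chrome version, downloads a matching chromedriver into a cache if one is missing, and prepends it to PATH, which removes the manual Service/executable_path wiring this commit is fixing. A minimal sketch of the pattern in isolation:

    import chromedriver_autoinstaller
    from selenium import webdriver

    # Fetches a chromedriver matching the installed Chrome (cached after the
    # first run) and puts it on PATH so webdriver.Chrome() can find it.
    chromedriver_autoinstaller.install()

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.ptt.cc/bbs/hotboards.html")
    print(driver.title)
    driver.quit()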
utils/index.py

@@ -84,8 +84,8 @@ def parse_twitter_time_string(time_str):
     """
     times = parser.parse(time_str, fuzzy=True)
     # a = datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S")
     b = datetime.datetime.strftime(times, "%Y-%m-%d %H:%M:%S")
     c = time.mktime(time.strptime(b, "%Y-%m-%d %H:%M:%S"))
     # Parse the relative time string
     return c
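`parse_twitter_time_string` round-trips the parsed datetime through a formatted string to reach an epoch; for naive (timezone-free) datetimes this equals calling `.timestamp()` directly, minus sub-second precision. A sketch of the function as reconstructed above, with an illustrative input:

    import time
    import datetime
    from dateutil import parser

    def parse_twitter_time_string(time_str):
        # fuzzy=True lets dateutil pick the date out of surrounding text
        times = parser.parse(time_str, fuzzy=True)
        b = datetime.datetime.strftime(times, "%Y-%m-%d %H:%M:%S")
        return time.mktime(time.strptime(b, "%Y-%m-%d %H:%M:%S"))

    print(parse_twitter_time_string("posted 2023-07-20 10:30:00 by liyang"))
    # For naive datetimes the round-trip is equivalent to times.timestamp()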