Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
1f24eff4
Commit
1f24eff4
authored
Jul 20, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:selenium 驱动配置
parent
781ee034
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
123 additions
and
62 deletions
+123
-62
pc_facebook.py
pc_facebook.py
+1
-1
pc_ptt.py
pc_ptt.py
+26
-8
pc_twitter.py
pc_twitter.py
+22
-13
pc_youtube.py
pc_youtube.py
+1
-1
createBrowserDriver.py
utils/createBrowserDriver.py
+73
-39
No files found.
pc_facebook.py
View file @
1f24eff4
...
...
@@ -22,7 +22,7 @@ def reptile(browser=None, search_word=""):
print
(
f
"搜索词:{search_word}"
)
url
=
"https://www.facebook.com/"
option
=
[
'--headless'
]
browser
=
browser
or
create
(
option
)
browser
=
browser
or
create
(
option
,
True
)
# 打开网页
browser
.
get
(
url
)
try
:
...
...
pc_ptt.py
View file @
1f24eff4
...
...
@@ -16,7 +16,8 @@ from utils.createBrowserDriver import create
import
opencc
from
utils.filse
import
save_json
import
os
from
config.settings
import
get_base_file_url
from
utils.download_image
import
download_image
'''
爬取台湾PTT论坛的热门帖子,包括帖子的标题、内容【文本、图片、视频】
...
...
@@ -27,7 +28,7 @@ import os
def
reptile
(
browser
=
None
,
search_word
=
""
):
url
=
"https://www.ptt.cc/bbs/hotboards.html"
# 无头模式执行
browser
=
browser
or
create
([
'--headless'
],
Fals
e
)
browser
=
browser
or
create
([
'--headless'
],
Tru
e
)
# 有头模式执行
# browser = browser or create()
# 打开网页
...
...
@@ -97,15 +98,31 @@ def reptile(browser=None, search_word=""):
# ---------------- 判断类型 start ----------
# 类型
content_type
=
""
# 查找所有img标签
image_list
=
soup
.
find_all
(
'img'
)
try
:
# 查找所有img标签
img_tags
=
soup
.
find_all
(
'img'
)
if
len
(
img_tags
)
>
0
:
if
len
(
image_list
)
>
0
:
content_type
=
"图文"
else
:
content_type
=
"文字"
except
:
content_type
=
"文字"
picture_url
=
[]
if
len
(
image_list
)
>
0
:
for
key
,
element
in
enumerate
(
image_list
):
# 下载图片至本地,替换标签中的src
id
=
str
(
int
(
time
.
time
()))
# 下载地址
download_dir
=
f
'{os.path.join(file_dir, f"{id}.jpg")}'
# 访问地址
access_address
=
f
'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
# 下载状态
status
=
download_image
(
element
[
'src'
],
download_dir
)
if
status
:
element
[
'src'
]
=
access_address
picture_url
.
append
(
access_address
)
else
:
print
(
""
)
# ---------------- 判断类型 end ----------
# log.debug('开始内容过滤')
# ------------------ content 过滤 start--------------
...
...
@@ -146,7 +163,8 @@ def reptile(browser=None, search_word=""):
"reptileTime"
:
str
(
int
(
time
.
time
())),
"type"
:
content_type
,
"author"
:
element_author
.
text
,
"releaseTime"
:
release_time
"releaseTime"
:
release_time
,
"picture_url"
:
","
.
join
(
picture_url
)
}
# --------------- 组装数据 end---------------------
...
...
@@ -175,7 +193,7 @@ def reptile(browser=None, search_word=""):
if
len
(
data
)
>
0
:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
str
(
int
(
time
.
time
()))
+
".json"
),
data
)
state_save
=
save_json
(
os
.
path
.
join
(
file_dir
,
str
(
int
(
time
.
time
()))
+
".json"
),
data
)
if
state_save
:
log
.
debug
(
'save file success'
)
else
:
...
...
@@ -220,7 +238,7 @@ def main():
# 全局变量
data
=
[]
table_name
=
"pms_ptt"
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",
table_name.split("_")[1])}'
# 是否启用
status_task
=
'0'
# 调用main函数
...
...
pc_twitter.py
View file @
1f24eff4
...
...
@@ -19,10 +19,16 @@ from config.settings import get_base_file_url
def
reptile
(
browser
=
None
,
search_word
=
""
):
"""
:param browser:
:param search_word:
"""
print
(
f
"搜索词:{search_word}"
)
base_url
=
"https://twitter.com/"
option
=
[
'--headless'
]
# ['--headless']
browser
=
browser
or
create
(
None
,
True
)
browser
=
browser
or
create
(
option
,
True
)
# print(browser)
# 打开网页
browser
.
get
(
base_url
)
...
...
@@ -48,26 +54,29 @@ def reptile(browser=None, search_word=""):
browser
.
get
(
url
)
time
.
sleep
(
4
)
base_xpath
=
"//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
# 内容块
element_content_list
=
browser
.
find_elements
(
'xpath'
,
base_xpath
)
element_content_list
=
browser
.
find_elements
(
'xpath'
,
base_xpath
)
# 作者
element_authors_list
=
browser
.
find_elements
(
'xpath'
,
f
"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']"
)
element_authors_list
=
browser
.
find_elements
(
'xpath'
,
f
"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']"
)
length
=
len
(
element_authors_list
)
for
index
in
range
(
length
):
# print(index)
soup
=
BeautifulSoup
(
element_content_list
[
index
]
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
soup
=
BeautifulSoup
(
element_content_list
[
index
]
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
# 查找time标签
time_soup
=
soup
.
find
(
'time'
)
timestamp
=
datetime
.
fromisoformat
(
time_soup
[
'datetime'
]
.
replace
(
"Z"
,
"+00:00"
))
.
timestamp
()
link_soup
=
time_soup
.
parent
link_str
=
base_url
+
link_soup
[
"href"
]
try
:
time_soup
=
soup
.
find
(
'time'
)
timestamp
=
datetime
.
fromisoformat
(
time_soup
[
'datetime'
]
.
replace
(
"Z"
,
"+00:00"
))
.
timestamp
()
link_soup
=
time_soup
.
parent
link_str
=
base_url
+
link_soup
[
"href"
]
except
:
link_str
=
""
timestamp
=
time
.
time
()
author
=
element_authors_list
[
index
]
.
text
# 标题取:作者+日期
title
=
f
"{author}-{datetime.fromtimestamp(int(timestamp))}"
video_list
=
soup
.
find_all
(
"video"
)
image_list
=
soup
.
find_all
(
"img"
)
# lth = len(ignore_list)
...
...
@@ -111,7 +120,6 @@ def reptile(browser=None, search_word=""):
print
(
""
)
content
=
soup
.
prettify
()
# ---------------- 判断类型 start ----------
# 类型
content_type
=
""
...
...
@@ -133,7 +141,8 @@ def reptile(browser=None, search_word=""):
"reptileTime"
:
str
(
int
(
time
.
time
())),
"type"
:
content_type
,
"author"
:
author
,
"releaseTime"
:
str
(
int
(
timestamp
))
"releaseTime"
:
str
(
int
(
timestamp
)),
"picture_url"
:
","
.
join
(
picture_url
)
}
# --------------- 组装数据 end---------------------
data
.
append
(
obj
)
...
...
@@ -189,7 +198,7 @@ def main():
# 全局变量
data
=
[]
table_name
=
"pms_twitter"
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",table_name.split("_")[1])}'
file_dir
=
f
'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data",
table_name.split("_")[1])}'
# 是否启用
status_task
=
'0'
# 调用main函数
...
...
pc_youtube.py
View file @
1f24eff4
...
...
@@ -21,7 +21,7 @@ def reptile(browser=None, search_word=""):
:return:
"""
option
=
[
'--headless'
]
browser
=
browser
or
create
([
'--headless'
])
browser
=
browser
or
create
([
'--headless'
]
,
True
)
# print(browser)
# 打开网页
url
=
f
'https://www.youtube.com/results?search_query={search_word}'
...
...
utils/createBrowserDriver.py
View file @
1f24eff4
...
...
@@ -8,65 +8,99 @@ from selenium.webdriver.common.by import By
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.webdriver.support.ui
import
WebDriverWait
import
chromedriver_autoinstaller
from
selenium.webdriver.firefox.firefox_profile
import
FirefoxProfile
# from mozprofile import FirefoxProfile
'''
创建浏览器实例
'''
def
create
(
option
=
None
,
using_user_data
=
True
):
def
create
(
option
=
None
,
using_user_data
=
True
,
web_browser
=
"firefox"
):
"""
:param web_browser:
:param using_user_data:
:param option:
:return:
"""
# 安装或升级 chromedriver
chromedriver_autoinstaller
.
install
()
#
chromedriver_autoinstaller.install()
# 获取现有Chrome浏览器用户数据目录
# chrome_user_data_dir = ""
# if platform.system() == 'Windows':
# chrome_user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
# 'User Data')
# elif platform.system() == 'Linux':
# chrome_user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
# elif platform.system() == 'Darwin':
# chrome_user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google','Chrome')
# else:
# raise Exception('Unsupported operating system')
def
get_user_data_dir
():
"""
:return:
"""
# 获取现有Chrome浏览器用户数据目录
user_data_dir
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'user_data'
)
# if platform.system() == 'Windows':
# if web_browser == "firefox":
# user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Mozilla', 'Firefox',
# 'Profiles')
# else:
# user_data_dir = os.path.join(os.environ['USERPROFILE'], 'AppData', 'Local', 'Google', 'Chrome',
# 'User Data')
# elif platform.system() == 'Linux':
# if web_browser == "firefox":
# user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'Firefox', 'Profiles','huqg7mpy.default-release')
# else:
# user_data_dir = os.path.join(os.path.expanduser('~'), '.config', 'google-chrome')
# elif platform.system() == 'Darwin':
# if web_browser == "firefox":
# user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Firefox',
# 'Profiles','huqg7mpy.default-release')
# else:
# user_data_dir = os.path.join(os.path.expanduser("~"), 'Library', 'Application Support', 'Google',
# 'Chrome')
# else:
# raise Exception('Unsupported operating system')
return
user_data_dir
chrome_options
=
webdriver
.
ChromeOptions
()
options
=
""
browser
=
""
if
web_browser
==
"firefox"
:
options
=
webdriver
.
FirefoxOptions
()
else
:
options
=
webdriver
.
ChromeOptions
()
if
option
is
not
None
:
for
value
in
option
:
chrome_
options
.
add_argument
(
value
)
options
.
add_argument
(
value
)
# 启用浏览器的持久性会话,可以保存登录状态和Cookie
# 使用本地
user_data_dir
=
os
.
path
.
join
(
os
.
path
.
abspath
(
"../"
),
'network-assets-reptile'
,
'user_data'
)
if
using_user_data
:
# 添加用户数据目录参数
chrome_options
.
add_argument
(
f
'--user-data-dir={user_data_dir}'
)
# 添加用户数据目录参数,启用浏览器的持久性会话,可以保存登录状态和Cookie
if
web_browser
==
"firefox"
:
firefox_profile_path
=
get_user_data_dir
()
# 将此处替换为你的Firefox用户数据目录路径
profile
=
FirefoxProfile
(
profile_directory
=
firefox_profile_path
)
options
.
profile
=
profile
else
:
options
.
add_argument
(
f
'--user-data-dir={get_user_data_dir()}'
)
if
sys
.
platform
.
startswith
(
'linux'
):
# print("当前系统是 Linux")
# linux下运行记得加上这些参数 ----------------------------
# chrome_options.add_argument('--headless')
chrome_options
.
add_argument
(
'--no-sandbox'
)
chrome_options
.
add_argument
(
'--disable-gpu'
)
chrome_options
.
add_argument
(
'--disable-dev-shm-usage'
)
# 加载chromedriver -------------------------------------------------
# windows 下的 chromedriver 默认加载路径是当前路径下的 chromedriver.exe
# linux 下的 chromedriver 默认加载路径是 /usr/local/bin/chromedriver
# 当然也可以通过 executable_path 自定义
browser
=
webdriver
.
Chrome
(
options
=
chrome_options
)
# -----------------------------------------------------------------
# if sys.platform.startswith('linux'):
# print("当前系统是 Linux")
# linux下运行记得加上这些参数 ----------------------------
# chrome_options.add_argument('--headless')
options
.
add_argument
(
"--window-size=1920x1080"
)
# 设置窗口大小,这是一个常见的完全无头模式的设置
options
.
add_argument
(
'--no-sandbox'
)
options
.
add_argument
(
'--disable-gpu'
)
options
.
add_argument
(
'--disable-dev-shm-usage'
)
# 加载chromedriver -------------------------------------------------
# windows 下的 chromedriver 默认加载路径是当前路径下的 chromedriver.exe
# linux 下的 chromedriver 默认加载路径是 /usr/local/bin/chromedriver
# 当然也可以通过 executable_path 自定义
if
web_browser
==
"firefox"
:
browser
=
webdriver
.
Firefox
(
options
=
options
)
else
:
# print("当前系统不是 Linux")
# linux下运行记得加上这些参数 ----------------------------
# chrome_options.add_argument('--headless') # 启用无头模式
chrome_options
.
add_argument
(
'--no-sandbox'
)
# 禁用沙盒模式
# 创建浏览器驱动对象
browser
=
webdriver
.
Chrome
(
options
=
chrome_options
)
browser
=
webdriver
.
Chrome
(
options
=
options
)
# -----------------------------------------------------------------
# else:
# # print("当前系统不是 Linux")
# # linux下运行记得加上这些参数 ----------------------------
# # chrome_options.add_argument('--headless') # 启用无头模式
# options.add_argument('--no-sandbox') # 禁用沙盒模式
# options.add_argument('--disable-gpu')
# options.add_argument('--disable-dev-shm-usage')
# # 创建浏览器驱动对象
# browser = webdriver.Chrome(options=options)
return
browser
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment