Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
46ec2eee
Commit
46ec2eee
authored
Jul 25, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:ins爬虫数据条件过滤优化
parent
6774ae74
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
10 deletions
+27
-10
pc_instagram.py
pc_instagram.py
+3
-2
createBrowserDriver.py
utils/createBrowserDriver.py
+15
-3
download_image.py
utils/download_image.py
+9
-5
No files found.
pc_instagram.py
View file @
46ec2eee
...
...
@@ -36,7 +36,7 @@ def reptile(browser=None, search_word=""):
base_url
=
"https://www.instagram.com/"
option
=
[
'--headless'
]
# ['--headless']
browser
=
browser
or
create
(
None
,
True
)
browser
=
browser
or
create
(
option
,
True
)
# print(browser)
# 打开网页
browser
.
get
(
base_url
)
...
...
@@ -123,7 +123,8 @@ def reptile(browser=None, search_word=""):
# print(img_soup.prettify())
soup
.
append
(
img_soup
)
picture_url
.
append
(
download_dir
)
else
:
picture_url
.
append
(
""
)
content
=
soup
.
prettify
()
# 类型
content_type
=
"图文"
...
...
utils/createBrowserDriver.py
View file @
46ec2eee
import
os
import
platform
import
sys
from
selenium
import
webdriver
# --------------- selenium 依赖 start ----------------
...
...
@@ -30,6 +31,7 @@ from utils.index import get_screen_resolution
def
create
(
option
=
None
,
using_user_data
=
True
,
web_browser
=
"firefox"
):
"""
生成selenium实例
:param web_browser:
:param using_user_data:
...
...
@@ -82,9 +84,19 @@ def create(option=None, using_user_data=True, web_browser="firefox"):
# chrome_options.add_argument('--headless')
# options.add_argument("--window-size=1920x1080") # 设置窗口大小,这是一个常见的完全无头模式的设置
# options.add_argument("--start-maximized") # 最大化窗口
options
.
add_argument
(
'--no-sandbox'
)
if
option
!=
None
:
# 无头模式下禁用gpu加速
options
.
add_argument
(
'--disable-gpu'
)
# 无头模式-linux 系统
if
option
!=
None
and
platform
.
system
()
==
"Linux"
:
'''
--disable-dev-shm-usage 是 Chrome 浏览器在无头模式下运行时的一个常用启动参数。在 Linux 系统下特别常见,通过这个参数,Chrome 浏览器会禁用对 /dev/shm 的使用。
'''
options
.
add_argument
(
'--disable-dev-shm-usage'
)
# 禁用沙盒模式
options
.
add_argument
(
'--no-sandbox'
)
# 加载chromedriver -------------------------------------------------
# windows 下的 chromedriver 默认加载路径是当前路径下的 chromedriver.exe
# linux 下的 chromedriver 默认加载路径是 /usr/local/bin/chromedriver
...
...
utils/download_image.py
View file @
46ec2eee
import
requests
import
os
def
download_image
(
url
,
save_path
):
"""
下载图片并保存到本地文件
:param url:
:param save_path:
:return:
:param url:
图片的 URL 地址
:param save_path:
图片保存的文件路径
:return:
下载成功返回 True,下载失败返回 False
"""
if
os
.
path
.
exists
(
save_path
):
# print(f"图片文件已存在:{save_path}")
return
True
response
=
requests
.
get
(
url
,
stream
=
True
)
if
response
.
status_code
==
200
:
with
open
(
save_path
,
'wb'
)
as
file
:
...
...
@@ -16,5 +20,5 @@ def download_image(url, save_path):
# print(f"图片下载成功:{save_path}")
return
True
else
:
print
(
f
"图片下载失败:{url}"
)
#
print(f"图片下载失败:{url}")
return
False
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment