liyang / network-assets-reptile · Commits · d12c76af

Commit d12c76af, authored Jul 25, 2023 by liyang

feat: optimize conditional filtering of Instagram crawler data
parent bda48e6b

Showing 3 changed files with 42 additions and 23 deletions
config/settings.py   +1  -1
pc_instagram.py      +33 -18
test.py              +8  -4
config/settings.py

...
@@ -4,7 +4,7 @@ def get_log_path():
 def get_base_url():
-    return "http://192.168.0.118:8081/"
+    return "http://192.168.0.104:8081/"
 def get_base_file_url():
...
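For context, a minimal sketch of what these settings helpers might look like: only the changed return value of get_base_url() comes from the diff; get_log_path() and get_base_file_url() appear in the hunk header and file, but their bodies below are assumptions.

# config/settings.py -- illustrative sketch, not the repository's actual file.
import os

def get_log_path():
    # assumed helper: directory where crawler logs are written
    return os.path.join(os.path.dirname(__file__), "..", "logs")

def get_base_url():
    # backend API endpoint; this commit changes the host from .118 to .104
    return "http://192.168.0.104:8081/"

def get_base_file_url():
    # assumed helper: base URL under which downloaded media files are served
    return "http://192.168.0.104:8081/files/"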
pc_instagram.py
...
@@ -72,7 +72,8 @@ def reptile(browser=None, search_word=""):
     wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]")))
     # Extract the other fields
-    author = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]//a")
+    author = browser.find_element("xpath", "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]/div/header/div[2]/div[1]/div[1]//a")
     content_element = browser.find_element("xpath",
                                            "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
...
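For context, a minimal sketch of the explicit-wait pattern this hunk relies on. The driver setup, target URL, and 10-second timeout are assumptions; only the XPaths come from the diff.

# Illustrative sketch of the wait/locate pattern used above (Selenium 4).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("https://www.instagram.com/")          # assumed target page
wait = WebDriverWait(browser, 10)                  # assumed timeout

# Block until the post dialog is present in the DOM, then read fields from it.
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']/div/div[2]")))
author = browser.find_element(
    "xpath",
    "//div[@role='dialog']/div//article/div/div[2]/div/div/div[1]/div/header/div[2]/div[1]/div[1]//a")
content_element = browser.find_element(
    "xpath",
    "//div[@role='dialog']/div//article/div/div[2]/div/div/div[2]/div[1]//div[@role='button']//h1")
print(author.text, content_element.text)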
@@ -90,21 +91,35 @@ def reptile(browser=None, search_word=""):
     # Filter out videos
     video_list = browser.find_elements("xpath", "//div[@role='dialog']/div//article/div/div[1]/div/div[1]//video")
     for key, item in enumerate(img_list):
+        src = ""
+        img_soup = ""
         if len(video_list) == 0:
             if key == 0:
-                title = item.get_attribute("alt")
+                title_str_list = item.get_attribute("alt").split("'")
+                # Download the image locally and replace the src in the tag
+                if len(title_str_list) >= 3:
+                    id = str(int(time.time()))
+                    title = title_str_list[2]
+                else:
+                    title = ""
+            img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser").find("img")
+            src = item.get_attribute("src")
+        else:
+            # A video is present, so take the image link from the link list instead
+            title = ""
+            a_soup = BeautifulSoup(element_link_list[index].get_attribute("outerHTML"), "html.parser")
+            # img_element = element_link_list[index].find_element("xpath","img")
+            img_soup = a_soup.find("img")
+            src = img_soup["src"]
+            str_list = link_str.split("/")
+            img_id = str_list[len(str_list) - 2]
         # Download path
-        download_dir = f'{os.path.join(file_dir, f"{id}.jpg")}'
+        download_dir = f'{os.path.join(file_dir, f"{img_id}.jpg")}'
         # Access URL
-        access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{id}.jpg'
+        access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{img_id}.jpg'
         # Download status
-        status = download_image(item.get_attribute("src"), download_dir)
+        status = download_image(src, download_dir)
         if status:
             # Append the image to the content
-            img_soup = BeautifulSoup(item.get_attribute("outerHTML"), "html.parser")
-            img_soup["src"] = access_address
+            img_soup.img["src"] = access_address
             # print(img_soup.prettify())
             soup.append(img_soup)
             picture_url.append(access_address)
...
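Conceptually, the new branch picks the image source either directly from the <img> element or, when the post contains a video, from the post's link element, downloads it under an id, and rewrites the tag's src to the served copy. The sketch below shows that flow in isolation; download_image, file_dir, table_name and get_base_file_url mirror names used in the diff, but their definitions here are assumptions made so the example runs on its own.

# Illustrative sketch of the image handling added in this hunk; not the project's actual helpers.
import os
import time
import requests
from bs4 import BeautifulSoup

file_dir = "downloads"            # assumed local download directory
table_name = "pms_instagram"      # assumed table name, matching the naming seen in the diff

def get_base_file_url():
    return "http://192.168.0.104:8081/files/"   # assumed value, see config/settings.py

def download_image(url, path):
    """Fetch url to path; return True on success (simplified stand-in for the repo's helper)."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            f.write(resp.content)
        return True
    except requests.RequestException:
        return False

def attach_image(outer_html, src, img_id=None):
    """Download the image at src and return an <img> tag whose src points at the served copy."""
    img_id = img_id or str(int(time.time()))     # fall back to a time-based id, as in the no-video branch
    download_dir = os.path.join(file_dir, f"{img_id}.jpg")
    access_address = f'{get_base_file_url()}{table_name.split("_")[1]}/{img_id}.jpg'
    img_soup = BeautifulSoup(outer_html, "html.parser").find("img")
    if download_image(src, download_dir):
        img_soup["src"] = access_address         # rewrite src to the locally served URL
        return img_soup, access_address
    return None, None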
@@ -176,7 +191,7 @@ def main():
...
@@ -176,7 +191,7 @@ def main():
log
.
debug
(
"call success"
)
log
.
debug
(
"call success"
)
search_word
=
""
search_word
=
""
for
item
in
response
[
'data'
][
'rows'
]:
for
item
in
response
[
'data'
][
'rows'
]:
if
item
[
'name'
]
==
'
pms_
instagram'
:
if
item
[
'name'
]
==
'instagram'
:
search_word
=
item
[
'keyword'
]
search_word
=
item
[
'keyword'
]
table_name
=
item
[
'tableName'
]
table_name
=
item
[
'tableName'
]
status_task
=
int
(
item
[
"status"
])
status_task
=
int
(
item
[
"status"
])
...
...
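For context, the task-list response this loop consumes presumably has the shape sketched below; the field names come from the hunk, while the example values are made up.

# Assumed shape of the response consumed by main(); values are illustrative only.
response = {
    "data": {
        "rows": [
            {"name": "instagram", "keyword": "some keyword", "tableName": "pms_instagram", "status": "1"},
            {"name": "twitter",   "keyword": "other keyword", "tableName": "pms_twitter", "status": "0"},
        ]
    }
}

search_word = ""
for item in response['data']['rows']:
    # This commit changes the match from 'pms_instagram' to 'instagram'
    if item['name'] == 'instagram':
        search_word = item['keyword']
        table_name = item['tableName']
        status_task = int(item["status"])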
test.py

(diff collapsed, not shown)