Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
c959a447
Commit
c959a447
authored
Jul 28, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:twitter 过滤
parent
0832e447
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
12 deletions
+14
-12
pc_twitter.py
pc_twitter.py
+14
-12
No files found.
pc_twitter.py
View file @
c959a447
...
...
@@ -37,7 +37,6 @@ def reptile(browser=None, search_word=""):
print
(
f
"搜索词:{search_word}"
)
base_url
=
"https://twitter.com/"
browser
=
browser
or
create
(
no_headless
=
False
,
using_user_data
=
True
)
# print(browser)
# 打开网页
browser
.
get
(
base_url
)
time
.
sleep
(
2
)
...
...
@@ -66,7 +65,6 @@ def reptile(browser=None, search_word=""):
button_login
.
click
()
time
.
sleep
(
2
)
except
:
# print("------")
error
=
""
url
=
'https://twitter.com/search?q='
+
search_word
+
'&src=typed_query'
...
...
@@ -82,7 +80,6 @@ def reptile(browser=None, search_word=""):
f
"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']"
)
length
=
len
(
element_authors_list
)
for
index
in
range
(
length
):
# print(index)
soup
=
BeautifulSoup
(
element_content_list
[
index
]
.
get_attribute
(
"outerHTML"
),
"html.parser"
)
# 查找time标签
try
:
...
...
@@ -105,14 +102,23 @@ def reptile(browser=None, search_word=""):
div_elements
=
soup
.
find
(
"div"
)
.
findChildren
(
"div"
,
recursive
=
False
)
# div_tags = soup.find_all("div", recursive=False)
for
item
in
video_list
:
div
=
soup
.
new_tag
(
'div'
)
img_tag
=
soup
.
new_tag
(
'img'
)
img_tag
[
"src"
]
=
item
[
"poster"
]
div
.
append
(
img_tag
)
for
items
in
div_elements
:
if
hasattr
(
items
,
"aria-labelledby"
):
attr
=
False
try
:
attr
=
items
[
"aria-labelledby"
]
except
:
attr
=
False
if
attr
:
# div["aria-labelledby"] = "sdfsf"
# div[@aria-labelledby="xx"] 替换为img标签【内容含有视频的替换为img标签】
items
.
replaceWith
(
img_tag
)
items
.
replaceWith
(
div
)
else
:
error
=
""
else
:
# print("")
error
=
""
image_list
=
soup
.
find_all
(
"img"
)
...
...
@@ -136,7 +142,6 @@ def reptile(browser=None, search_word=""):
element
[
'src'
]
=
access_address
picture_url
.
append
(
download_dir
)
else
:
# print("")
error
=
""
# 删除多余div
...
...
@@ -148,7 +153,6 @@ def reptile(browser=None, search_word=""):
item
.
extract
()
content
=
soup
.
prettify
()
print
(
""
)
# ---------------- 判断类型 start ----------
# 类型
content_type
=
""
...
...
@@ -176,10 +180,9 @@ def reptile(browser=None, search_word=""):
}
# --------------- 组装数据 end---------------------
data
.
append
(
obj
)
soup
=
""
time
.
sleep
(
0.1
)
# 发送爬取数据到java服务
# print('----------------------')
# print(data)
if
len
(
data
)
>
0
:
# 保存json文件到本地
# log.debug(os.path.abspath("../"))
...
...
@@ -212,7 +215,6 @@ def main():
# 请求关键词
response
=
getReptileTask
()
global
status_task
# print(response)
if
response
[
'status_code'
]
==
200
and
response
[
'data'
][
'code'
]
==
200
:
log
.
debug
(
"call success"
)
search_word
=
""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment