Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
network-assets-reptile
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
liyang
network-assets-reptile
Commits
794b15d6
Commit
794b15d6
authored
Jul 19, 2023
by
liyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix:facebook 富文本删除视频相关html标签,增加video标签占位
parent
5773068e
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
62 additions
and
24 deletions
+62
-24
settings.py
config/settings.py
+2
-2
pc_facebook.py
pc_facebook.py
+58
-20
.gitkeep
reptile_data/facebook/.gitkeep
+0
-0
index.py
utils/index.py
+2
-2
No files found.
config/settings.py
View file @
794b15d6
...
...
@@ -3,7 +3,7 @@ def get_log_path():
     return "../"

 def get_base_url():
-    return "http://192.168.0.127:8081/"
+    return "http://192.168.0.118:8081/"

 def get_base_file_url():
-    return "http://192.168.0.127:8186/"
\ No newline at end of file
+    return "http://192.168.0.118:8186/"
\ No newline at end of file
pc_facebook.py
View file @
794b15d6
...
...
@@ -5,7 +5,7 @@ from utils.Logger import log
 from utils.createBrowserDriver import create
 from utils.filse import save_json
 from api.index import importJson, getReptileTask, importJsonPath
-from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
+from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
 # from pytube import YouTube
 from datetime import datetime
 import os
...
...
@@ -25,7 +25,7 @@ def reptile(browser=None, search_word=""):
     browser.get(url)
     try:
         # 检测是否要登录
-        login_input = browser.find_element('xpath', "//input[@name='email']")
+        login_input = browser.find_element('xpath', "//input[@name='email']")
         password_input = browser.find_element('xpath', "//input[@name='pass']")
         login_input.send_keys("liyang19970814@gmail.com")
         password_input.send_keys("xn89kiPT/^Kaeg#")
...
...
@@ -34,37 +34,74 @@ def reptile(browser=None, search_word=""):
button_login
.
click
()
time
.
sleep
(
3
)
except
:
print
(
"
error
"
)
print
(
"
已登录
"
)
url
=
f
"https://www.facebook.com/search/top?q={search_word}"
browser
.
get
(
url
)
# 使用 JavaScript 将网页滚动到底部
browser
.
execute_script
(
"window.scrollTo(0, document.body.scrollHeight);"
)
time
.
sleep
(
3
)
# 帖子块集合
elements
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]"
)
# 内容
element_content_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]"
)
# 作者
element_authors_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]"
)
element_authors_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]"
)
# 发布时间
element_release_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]"
)
# 查找所有 展开 按钮,循环点击后在查找内容
elements_expand_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']"
)
for
item
in
elements_expand_list
:
item
.
click
()
# 内容
element_content_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]"
)
# print(element_content_list)
length
=
len
(
elements
)
elements_expand_list
=
browser
.
find_elements
(
'xpath'
,
"//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']"
)
for
key
,
element
in
enumerate
(
elements_expand_list
):
try
:
# 使用JavaScript 执行点击操作
browser
.
execute_script
(
"arguments[0].click();"
,
element
)
except
Exception
as
e
:
print
(
"Clicking element failed: "
+
str
(
e
))
length
=
len
(
element_content_list
)
# print(length)
for
index
in
range
(
length
):
author
=
element_authors_list
[
index
]
.
text
release_time
=
str
(
int
(
parse_time_string
(
element_release_list
[
index
]
.
text
)))
content
=
element_content_list
[
index
]
.
get_attribute
(
"outerHTML"
)
# print(element_release_list[index].text)
# print(parse_time_string(element_release_list[index].text))
release_time_timestamp
=
int
(
parse_time_string
(
element_release_list
[
index
]
.
text
))
release_time
=
str
(
release_time_timestamp
)
# release_time = ""
# content = element_content_list[index].get_attribute("outerHTML")
# 使用BeautifulSoup解析HTML
soup
=
BeautifulSoup
(
element_content_list
[
index
]
.
get_attribute
(
'innerHTML'
),
'html.parser'
)
text
=
element_content_list
[
index
]
.
text
soup
=
BeautifulSoup
(
element_content_list
[
index
]
.
get_attribute
(
'outerHTML'
),
'html.parser'
)
soup_str
=
soup
.
prettify
()
# 查找是否含有视频
# ignore_list = soup.find_all("div", {"data-visualcompletion": "video"})
video_list
=
soup
.
find_all
(
"video"
)
# lth = len(ignore_list)
if
len
(
video_list
)
>
0
:
# 删除第二个子元素
# 找到包含两个 <div> 元素的父级元素
parent_div
=
soup
.
find
(
'div'
)
# 找到所有的 <div> 子元素
div_elements
=
parent_div
.
find_all
(
'div'
,
recursive
=
False
)
# div_tags = soup.find_all("div", recursive=False)
# 确保列表中至少有两个 <div> 子元素
if
len
(
div_elements
)
>=
2
:
# 获取第二个 <div> 元素,并将其从父级元素中移除
div_to_remove
=
div_elements
[
1
]
div_to_remove
.
extract
()
# 删除
# div.decompose()
# 创建video标签占位
custom_video
=
soup
.
new_tag
(
"video"
)
custom_video
[
"src"
]
=
""
parent_div
.
append
(
custom_video
)
else
:
print
(
""
)
content
=
soup
.
prettify
()
# 标题取:作者+日期
title
=
f
"{author}-{datetime.fromtimestamp(int(parse_time_string(element_release_list[index].text)))}"
title
=
f
"{author}-{datetime.fromtimestamp(release_time_timestamp)}"
# title = ""
# ---------------- 判断类型 start ----------
# 类型
content_type
=
""
...
...
@@ -183,6 +220,7 @@ def reptile(browser=None, search_word=""):
     browser.quit()

 def main():
     """
...
reptile_data/facebook/.gitkeep
deleted
100644 → 0
View file @
5773068e
utils/index.py
View file @
794b15d6
...
...
@@ -38,9 +38,9 @@ def parse_time_string(time_str):
     else:
         try:
             datetime_str = time_str.replace("月", " ").replace("日", "")
-            month, day, hour, minute = map(int, datetime_str.split())
+            month, day = map(int, datetime_str.split())
             current_year = datetime.datetime.now().year
-            datetime_obj = datetime.datetime(year=current_year, month=month, day=day, hour=hour, minute=minute)
+            datetime_obj = datetime.datetime(year=current_year, month=month, day=day)
             return int(datetime_obj.timestamp())
         except ValueError:
             return None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment