Commit 5773068e authored Jul 18, 2023 by liyang

fix: scrape Twitter data in a targeted way to work around its anti-crawler measures

parent e0c2ddfc
Showing 4 changed files with 34 additions and 48 deletions (+34 -48)

pc_facebook.py  +6 -25
pc_ptt.py       +1 -1
pc_twitter.py   +26 -21
utils/index.py  +1 -1
pc_facebook.py
@@ -7,6 +7,7 @@ from utils.filse import save_json
 from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
 # from pytube import YouTube
 from datetime import datetime
 import os
 from config.settings import get_base_file_url
@@ -19,16 +20,9 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
     url = "https://www.facebook.com/"
     option = ['--headless']
     # ['--headless']
     browser = browser or create(option)
     # year = datetime(2021, 1, 1)
     # startDate = datetime(2020, 12, 31)  # start date
     # endDate = datetime(2020, 12, 31)  # end date
     # print(browser)
     # open the page
     browser.get(url)
     # print("00000000000000000")
     # time.sleep(3)
     try:
         # check whether a login is required
         login_input = browser.find_element('xpath', "//input[@name='email']")
@@ -40,25 +34,17 @@ def reptile(browser=None, search_word=""):
         button_login.click()
         time.sleep(3)
     except:
         # print("------")
         a = 1
         # time.sleep(3)
         print("error")
     url = f"https://www.facebook.com/search/top?q={search_word}"
     browser.get(url)
     # use JavaScript to scroll the page to the bottom
     browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
     time.sleep(3)
     # collection of post blocks
     elements = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]")
     # print(333333)
     # time.sleep(3)
     # authors
     element_authors_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
     # print(element_authors_list)
     # print("2222")
     # release time
     element_release_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
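A note on the window.scrollTo line above: a single scroll to the bottom only triggers one batch of Facebook's lazy-loaded posts. This commit keeps the single scroll, but the usual pattern for loading a whole feed is to scroll in a loop until the page height stops growing. A minimal sketch, not part of this commit, with illustrative names:

import time

def scroll_to_end(browser, pause=3.0, max_rounds=10):
    # assumes `browser` is a selenium webdriver, as in reptile()
    last_height = browser.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the next batch of posts time to render
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # height stopped growing: no more posts to load
        last_height = new_height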
@@ -66,24 +52,19 @@ def reptile(browser=None, search_word=""):
     elements_expand_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
     for item in elements_expand_list:
         item.click()
         # time.sleep(2)
     # content
     element_content_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
     # print(element_content_list)
     length = len(elements)
     # print(length)
     for index in range(length):
         author = element_authors_list[index].text
         # el = element_release_list[index]
         # # datetime_el = el.get_attribute("datetime")
         # html = el.text
         # strip the html tags contained in the time string
         # BeautifulSoup(element_release_list[index].get_attribute("innerHTML"),"html.parser").get_text()
         release_time = str(int(parse_time_string(element_release_list[index].text)))
-        content = element_content_list[index].get_attribute("innerHTML")
+        content = element_content_list[index].get_attribute("outerHTML")
         # parse the HTML with BeautifulSoup
         soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser')
         # title: author + date
         title = f"{author}-{datetime.fromtimestamp(int(parse_time_string(element_release_list[index].text)))}"
         # ---------------- determine type start ----------
         # type
         content_type = ""
@@ -99,7 +80,7 @@ def reptile(browser=None, search_word=""):
         # ---------------- determine type end ----------
         # --------------- assemble data start ---------------------
         obj = {
-            "title": "",
+            "title": title,
             "content": content,
             "link": element_release_list[index].get_attribute("href"),
             "reptileTime": str(int(time.time())),
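The recurring change in this commit (here and in pc_ptt.py below) is swapping get_attribute("innerHTML") for get_attribute("outerHTML"): innerHTML drops the element's own tag and attributes, while outerHTML keeps them, which matters when the wrapper's markup is needed later. A minimal standalone sketch of the difference, using BeautifulSoup equivalents and invented sample HTML:

from bs4 import BeautifulSoup

html = '<div class="post"><span>hello</span></div>'
tag = BeautifulSoup(html, "html.parser").div

print(tag.decode_contents())  # innerHTML equivalent: <span>hello</span>
print(str(tag))               # outerHTML equivalent: <div class="post"><span>hello</span></div>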
pc_ptt.py
@@ -79,7 +79,7 @@ def reptile(browser=None, search_word=""):
         # remove <a> tags whose href value contains 'img'
         # ------------------------------------
         # parse the HTML with BeautifulSoup
-        soup = BeautifulSoup(element_content.get_attribute('innerHTML'), 'html.parser')
+        soup = BeautifulSoup(element_content.get_attribute('outerHTML'), 'html.parser')
         # author
         element_author = browser.find_element('xpath', "//div[@id='main-content']/div[@class='article-metaline'][1]/span[2]")
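For context on the "remove <a> tags whose href value contains 'img'" comment above: with BeautifulSoup this kind of filtering is typically done by matching on the href attribute and calling decompose() to delete the tag from the tree. A hedged sketch with invented sample HTML, not code from this repo:

from bs4 import BeautifulSoup

html = '<div>text <a href="https://i.img.example/x.png">pic</a> more <a href="/page">link</a></div>'
soup = BeautifulSoup(html, "html.parser")
for a in soup.find_all("a", href=lambda h: h and "img" in h):
    a.decompose()  # remove the matching <a> from the parse tree
print(soup.get_text())  # "text  more link" - the image link is gone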
pc_twitter.py
@@ -8,6 +8,7 @@ from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_twitter_time_string
 # from pytube import YouTube
 import os
 from datetime import datetime
 from config.settings import get_base_file_url
 # utility function: download images
@@ -17,13 +18,13 @@ from config.settings import get_base_file_url
 def reptile(browser=None, search_word=""):
-    url = "https://twitter.com/"
+    base_url = "https://twitter.com/"
     option = ['--headless']
     # ['--headless']
     browser = browser or create(None, False)
     # print(browser)
     # open the page
-    browser.get(url)
+    browser.get(base_url)
     time.sleep(3)
     try:
         # check whether a login is required
@@ -43,29 +44,32 @@ def reptile(browser=None, search_word=""):
     time.sleep(2)
     url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
     browser.get(url)
-    time.sleep(3)
+    time.sleep(4)
+    base_xpath = "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]"
     # content blocks
-    element_content_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
+    element_content_list = browser.find_elements('xpath', base_xpath)
     # authors
-    element_authors_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']/div[1]//a[@role='link']")
+    element_authors_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[1]//a[@role='link']")
     # time.sleep(2)
     # release time
     element_release_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//div[2]//time[@datetime]")
     # element_release_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time")
     # time_a_list = browser.find_elements('xpath', f"{base_xpath}//div[@data-testid='User-Name']/div[2]//a/time/..")
     # print(element_content_list)
     length = len(element_authors_list)
     for index in range(length):
         # print(index)
+        content = element_content_list[index].get_attribute("outerHTML")
+        soup = BeautifulSoup(content, "html.parser")
+        # find the <time> tag
+        time_soup = soup.find('time')
+        timestamp = datetime.fromisoformat(time_soup['datetime'].replace("Z", "+00:00")).timestamp()
+        link_soup = time_soup.parent
+        link_str = base_url + link_soup["href"]
         author = element_authors_list[index].text
         try:
             release_time = str(int(parse_twitter_time_string(element_release_list[index].text)))
         except:
             release_time = str(int(time.time()))
         content = element_content_list[index].get_attribute("innerHTML")
         # print(content)
         # content filtering
         # parse the HTML with BeautifulSoup
         soup = BeautifulSoup(content, 'html.parser')
         # title: author + date
         title = f"{author}-{datetime.fromtimestamp(int(timestamp))}"
         # ---------------- determine type start ----------
         # type
         content_type = ""
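The core of this hunk is that the tweet's timestamp and permalink are now parsed out of the content block's outerHTML rather than collected through separate, brittle XPath queries. A self-contained sketch of that extraction, assuming markup shaped like Twitter's (a <time datetime="..."> nested inside the permalink <a>, which is what the code above relies on):

from datetime import datetime
from bs4 import BeautifulSoup

base_url = "https://twitter.com/"
sample = '<a href="/someuser/status/123"><time datetime="2023-07-18T09:30:00.000Z">Jul 18</time></a>'
soup = BeautifulSoup(sample, "html.parser")

time_tag = soup.find("time")
# fromisoformat() (before Python 3.11) rejects a trailing "Z", hence the replace()
ts = datetime.fromisoformat(time_tag["datetime"].replace("Z", "+00:00")).timestamp()
link = base_url + time_tag.parent["href"].lstrip("/")
print(int(ts), link)  # 1689672600 https://twitter.com/someuser/status/123

One caveat: in the diff, base_url ends with "/" and the href starts with "/", so link_str comes out with a double slash; the lstrip("/") in this sketch is only there to sidestep that.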
@@ -81,15 +85,16 @@ def reptile(browser=None, search_word=""):
         # ---------------- determine type end ----------
         # --------------- assemble data start ---------------------
         obj = {
-            "title": "",
+            "title": title,
             "content": content,
-            "link": "",
+            "link": link_str,
             "reptileTime": str(int(time.time())),
             "type": content_type,
             "author": author,
-            "releaseTime": release_time
+            "releaseTime": str(int(timestamp))
         }
         # --------------- assemble data end ---------------------
         data.append(obj)
     # send the scraped data to the java service
     # print('----------------------')
utils/index.py
@@ -22,7 +22,7 @@ def parse_time_string(time_str):
     :param time_str:
     :return:
     """
-    log.debug(f'转换face4book的发布时间:{time_str}')
+    # log.debug(f'转换face4book的发布时间:{time_str}')
     if "天" in time_str:
         number = int(time_str.split("天")[0])
         time_delta = datetime.timedelta(days=number)
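The visible branch of parse_time_string converts Facebook's relative dates such as "3天" ("3 days [ago]") into an absolute epoch timestamp by subtracting a timedelta from the current time. A hedged sketch of just this branch; the full function presumably handles other units (hours, minutes) the same way:

import datetime

def parse_relative_days(time_str):
    # mirrors the "天" (day) branch shown in the diff above
    if "天" in time_str:
        number = int(time_str.split("天")[0])
        time_delta = datetime.timedelta(days=number)
        return (datetime.datetime.now() - time_delta).timestamp()
    raise ValueError(f"unsupported time string: {time_str!r}")

print(parse_relative_days("3天"))  # epoch seconds for three days ago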