liyang / network-assets-reptile · Commits

Commit 023912ec, authored Jul 14, 2023 by liyang
fix: update facebook and twitter
parent ab634c7f

Showing 8 changed files with 682 additions and 150 deletions (+682 −150)
api/index.py                   +54   −15
pc_facebook.py                 +224  −83
pc_instagram.py                +243  −0
pc_ptt.py                      +11   −13
pc_twitter.py                  +90   −29
pc_youtube.py                  +8    −6
utils/createBrowserDriver.py   +4    −2
utils/index.py                 +48   −2

api/index.py  (+54 −15)

 import requests
+from requests.exceptions import Timeout
 from config.settings import get_base_url

 headers = {}
 baser_url = get_base_url()
+# Request timeout in seconds
+timeout = 3
+
+
+def custom_request(method, url, *args, **kwargs):
+    """
+    :param method:
+    :param url:
+    :param args:
+    :param kwargs:
+    :return:
+    """
+    try:
+        response = requests.request(method, url, timeout=timeout, *args, **kwargs)
+        response = {"status_code": response.status_code, "data": response.json()}
+    except Timeout:
+        response = {"status_code": 500}
+    # The response body is returned as parsed JSON
+    return response


 def importJson(file, form_data):
     """
     :param file:
     :param form_data:
     :return:
     """
     http_url = baser_url + "importJson/import"
-    response = requests.post(http_url, headers=headers, files={"file": file}, data=form_data)
-    return {"status_code": response.status_code, "data": response.json()}
+    response = custom_request('POST', http_url, headers=headers, files={"file": file}, data=form_data)
+    return response


 def getReptileTask():
     """
     :return:
     """
     http_url = baser_url + "crawlerSetting/list"
-    response = requests.get(http_url, headers=headers)
-    return {"status_code": response.status_code, "data": response.json()}
+    response = custom_request('GET', http_url)
+    return response


 def importJsonPath(form_data):
     """
     :param form_data:
     :return:
     """
-    headers = {"Content-Type": "application/json"}
+    header = {"Content-Type": "application/json"}
     http_url = baser_url + "importJson/importJsonPath"
-    response = requests.post(http_url, headers=headers, data=form_data)
-    return {"status_code": response.status_code, "data": response.json()}
+    response = custom_request('POST', http_url, headers=header, data=form_data)
+    return response


 def runingPython(form_data):
     """
     :param form_data:
     :return:
     """
-    headers = {"Content-Type": "application/json"}
+    header = {"Content-Type": "application/json"}
     http_url = baser_url + "python/startPy"
-    response = requests.post(http_url, headers=headers, data=form_data)
-    return {"status_code": response.status_code, "data": response.json()}
+    response = custom_request('POST', http_url, headers=header, data=form_data)
+    return response
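
A minimal usage sketch of the wrapped endpoints above, for orientation only. The row fields mirror how the crawler scripts read the response; the file path and form values are invented placeholders, not part of the commit:

    # Fetch the crawler settings; custom_request returns a plain dict, so a
    # timeout shows up as {"status_code": 500} with no "data" key.
    task_response = getReptileTask()
    if task_response["status_code"] == 200 and task_response["data"]["code"] == 200:
        for row in task_response["data"]["rows"]:
            print(row["name"], row["keyword"], row["status"])

    # Hand a previously saved JSON file path to the backend for import; the
    # path/tableName keys match the commented-out form_data blocks in the crawlers.
    form_data = {"path": "/tmp/facebook_1689300000.json", "tableName": "pms_facebook"}
    result = importJsonPath(form_data)
    print(result["status_code"])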

pc_facebook.py  (+224 −83) — updated file:

# Import dependencies
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from utils.Logger import log
from selenium.webdriver.common.by import By
from utils.createBrowserDriver import create
from selenium.webdriver.support import expected_conditions as EC
from utils.filse import save_json
from selenium.webdriver.support.ui import WebDriverWait
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
# from pytube import YouTube
import os
from config.settings import get_base_file_url

# Utility function - download images
'''
Open the target page, use Selenium to simulate a click on the "GDPR-accept" button, then click the
"search-show-more-button" button in a loop to load more data until it is no longer clickable.
Finally, grab the full paginated content and close the browser driver.
'''


def reptile(browser=None, search_word=""):
    url = "https://www.facebook.com/"
    option = ['--headless']
    # ['--headless']
    browser = browser or create(option)
    # year = datetime(2021, 1, 1)
    # startDate = datetime(2020, 12, 31)  # initial date
    # endDate = datetime(2020, 12, 31)  # end date
    # print(browser)
    # Open the page
    browser.get(url)
    # print("00000000000000000")
    # time.sleep(3)
    try:
        # Check whether a login is required
        login_input = browser.find_element('xpath', "//input[@name='email']")
        password_input = browser.find_element('xpath', "//input[@name='pass']")
        login_input.send_keys("liyang19970814@gmail.com")
        password_input.send_keys("xn89kiPT/^Kaeg#")
        # Get the login button
        button_login = browser.find_element('xpath', "//button[@name='login']")
        button_login.click()
        time.sleep(3)
    except:
        # print("------")
        a = 1
    # time.sleep(3)
    url = f"https://www.facebook.com/search/top?q={search_word}"
    browser.get(url)
    # Use JavaScript to scroll the page to the bottom
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    # Collection of post blocks
    elements = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]")
    # print(333333)
    # time.sleep(3)
    # Authors
    element_authors_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
    # print(element_authors_list)
    # print("2222")
    # Publish times
    element_release_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
    # Find all "展开" (expand) buttons and click each one before reading the content
    elements_expand_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
    for item in elements_expand_list:
        item.click()
        # time.sleep(2)
    # Content
    element_content_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
    # print(element_content_list)
    length = len(elements)
    # print(length)
    for index in range(length):
        author = element_authors_list[index].text
        # el = element_release_list[index]
        # # datetime_el = el.get_attribute("datetime")
        # html = el.text
        # Strip html tags from the time string
        # BeautifulSoup(element_release_list[index].get_attribute("innerHTML"),"html.parser").get_text()
        release_time = str(int(parse_time_string(element_release_list[index].text)))
        content = element_content_list[index].get_attribute("innerHTML")
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser')
        # ---------------- determine content type: start ----------
        # Type
        content_type = ""
        try:
            # Find all img tags
            img_tags = soup.find_all('img')
            if len(img_tags) > 0:
                content_type = "图文"
            else:
                content_type = "文字"
        except:
            content_type = "文字"
        # ---------------- determine content type: end ----------
        # --------------- assemble data: start ---------------------
        obj = {"title": "",
               "content": content,
               "link": element_release_list[index].get_attribute("href"),
               "reptileTime": str(int(time.time())),
               "type": content_type,
               "author": author,
               "releaseTime": release_time
               }
        # --------------- assemble data: end ---------------------
        data.append(obj)
        # print(content)
        # Content filtering
        # Parse the HTML with BeautifulSoup
        # soup = BeautifulSoup(content.get_attribute("innerHTML"), 'html.parser')
        # print(soup)
        # print("-----")
    # time.sleep(3)
    # Grab the full paginated content
    # page_content = browser.page_source
    # soup = BeautifulSoup(page_content, 'html.parser')
    # print("----------")
    # print(soup)
    # list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
    # for index, item in enumerate(list_news):
    #     logger.debug(item)
    #     # grab the image
    #     image_key = image_key + 1
    #     url_element = item.find('img', {"class": "css-rq4mmj"})
    #     image_url = url_element['src'] if url_element else ""
    #     # logger.debug(url)
    #     if image_url:
    #         # logger.debug(url)
    #         # # download the image
    #         #
    #         filename = f"{image_key}.jpg"
    #         # logger.debug(filename)
    #         # sys.exit()
    #         download_image(image_url, f'{fileDir}images/{filename}')
    #     # grab the text
    #     title_element = item.find('h4', {"class": "css-2fgx4k"})
    #     introduction_element = item.find('p', {"class": "css-16nhkrn"})
    #     title = title_element.get_text() if title_element else ""
    #     introduction = introduction_element.get_text() if introduction_element else ""
    #     news = {
    #         "title": title,
    #         "introduction": introduction,
    #         "imageName": filename
    #     }
    #     data.append(news)
    # logger.debug(data)
    # Save the data to a file
    # with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
    #     json.dump(data, file, indent=2, ensure_ascii=False)
    # Send the crawled data to the Java service
    # print('----------------------')
    # print(data)
    if len(data) > 0:
        # Save the JSON file locally
        log.debug(os.path.abspath("../"))
        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "facebook", str(int(time.time())) + ".json")}'
        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
        state_save = save_json(file_dir, data)
        # log.debug("-----------------------------")
        # write_to_database(data)
        if state_save:
            log.debug('save file success')
            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
            # log.debug('file_path:' + file_dir)
            # form_data = {
            #     "path": file_dir,
            #     "tableName": table_name
            # }
            # response = importJsonPath(form_data)
        else:
            log.debug('save file failed')
    else:
        # Nothing was crawled
        log.info("未爬取到数据")
        # form_data = {
        #     "path": "",
        #     "tableName": table_name
        # }
        # response = importJsonPath(form_data)
    # Close the browser driver
    browser.quit()


def main():
    """
    """
    # Fetch the crawl keyword
    response = getReptileTask()
    global status_task
    # print(response)
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'ptt':
                search_word = item['keyword']
                table_name = item['tableName']
                status_task = item["status"]
        # Convert Simplified to Traditional Chinese
        if status_task == 0 and len(search_word) > 0:
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        # Request timed out
        reptile(None, convert_to_traditional("新闻"))
        # upload_control()


# Global variables
data = []
table_name = "pms_facebook"
# Call main
main()
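
For orientation, this is the shape of one record that reptile() appends to data and save_json() writes out. The keys come from the obj dict above; every value here is an invented placeholder:

    example_record = {
        "title": "",
        "content": "<div>…post body HTML…</div>",                     # innerHTML of the post node
        "link": "https://www.facebook.com/hypothetical-post-permalink",
        "reptileTime": "1689300000",                                   # crawl time, unix seconds as a string
        "type": "图文",                                                # "图文" (image + text) or "文字" (text only)
        "author": "Example Author",
        "releaseTime": "1689300000"                                    # parsed from the relative post time
    }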

pc_instagram.py  (new file, +243 −0):

import json
import time
from bs4 import BeautifulSoup
from utils.Logger import log
from utils.createBrowserDriver import create
from utils.filse import save_json
from api.index import importJson, getReptileTask, importJsonPath
from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time, parse_time_string
# from pytube import YouTube
import os
from config.settings import get_base_file_url

# Utility function - download images
'''
Open the target page, use Selenium to simulate a click on the "GDPR-accept" button, then click the
"search-show-more-button" button in a loop to load more data until it is no longer clickable.
Finally, grab the full paginated content and close the browser driver.
'''


def reptile(browser=None, search_word=""):
    url = "https://www.facebook.com/"
    option = ['--headless']
    # ['--headless']
    browser = browser or create(option)
    # year = datetime(2021, 1, 1)
    # startDate = datetime(2020, 12, 31)  # initial date
    # endDate = datetime(2020, 12, 31)  # end date
    # print(browser)
    # Open the page
    browser.get(url)
    # print("00000000000000000")
    # time.sleep(3)
    try:
        '''
        Email: liyang19970814@gmail.com
        Account: abayomi7742
        Full name: Abayomi
        Password: 3Z84UMt)34NZj;T
        '''
        # Check whether a login is required
        login_input = browser.find_element('xpath', "//input[@name='email']")
        password_input = browser.find_element('xpath', "//input[@name='pass']")
        login_input.send_keys("liyang19970814@gmail.com")
        password_input.send_keys("xn89kiPT/^Kaeg#")
        # Get the login button
        button_login = browser.find_element('xpath', "//button[@name='login']")
        button_login.click()
        time.sleep(3)
    except:
        # print("------")
        a = 1
    # time.sleep(3)
    url = f"https://www.facebook.com/search/top?q={search_word}"
    browser.get(url)
    # Use JavaScript to scroll the page to the bottom
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    # Collection of post blocks
    elements = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]")
    # print(333333)
    # time.sleep(3)
    # Authors
    element_authors_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//h3/span[1]")
    # print(element_authors_list)
    # print("2222")
    # Publish times
    element_release_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//span[@dir]/span//a[@role='link' and @aria-label]")
    # Find all "展开" (expand) buttons and click each one before reading the content
    elements_expand_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]//div[@role='button' and text()='展开']")
    for item in elements_expand_list:
        item.click()
        # time.sleep(2)
    # Content
    element_content_list = browser.find_elements('xpath', "//div[@role='feed']/div//div[@aria-describedby]/div/div/div/div/div/div[2]/div/div/div[3]")
    # print(element_content_list)
    length = len(elements)
    # print(length)
    for index in range(length):
        author = element_authors_list[index].text
        # el = element_release_list[index]
        # # datetime_el = el.get_attribute("datetime")
        # html = el.text
        # Strip html tags from the time string
        # BeautifulSoup(element_release_list[index].get_attribute("innerHTML"),"html.parser").get_text()
        release_time = str(int(parse_time_string(element_release_list[index].text)))
        content = element_content_list[index].get_attribute("innerHTML")
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(element_content_list[index].get_attribute('innerHTML'), 'html.parser')
        # ---------------- determine content type: start ----------
        # Type
        content_type = ""
        try:
            # Find all img tags
            img_tags = soup.find_all('img')
            if len(img_tags) > 0:
                content_type = "图文"
            else:
                content_type = "文字"
        except:
            content_type = "文字"
        # ---------------- determine content type: end ----------
        # --------------- assemble data: start ---------------------
        obj = {"title": "",
               "content": content,
               "link": element_release_list[index].get_attribute("href"),
               "reptileTime": str(int(time.time())),
               "type": content_type,
               "author": author,
               "releaseTime": release_time
               }
        # --------------- assemble data: end ---------------------
        data.append(obj)
        # print(content)
        # Content filtering
        # Parse the HTML with BeautifulSoup
        # soup = BeautifulSoup(content.get_attribute("innerHTML"), 'html.parser')
        # print(soup)
        # print("-----")
    # time.sleep(3)
    # Grab the full paginated content
    # page_content = browser.page_source
    # soup = BeautifulSoup(page_content, 'html.parser')
    # print("----------")
    # print(soup)
    # list_news = soup.find_all('li', {"class": "css-1l4w6pd"})
    # for index, item in enumerate(list_news):
    #     logger.debug(item)
    #     # grab the image
    #     image_key = image_key + 1
    #     url_element = item.find('img', {"class": "css-rq4mmj"})
    #     image_url = url_element['src'] if url_element else ""
    #     # logger.debug(url)
    #     if image_url:
    #         # logger.debug(url)
    #         # # download the image
    #         #
    #         filename = f"{image_key}.jpg"
    #         # logger.debug(filename)
    #         # sys.exit()
    #         download_image(image_url, f'{fileDir}images/{filename}')
    #     # grab the text
    #     title_element = item.find('h4', {"class": "css-2fgx4k"})
    #     introduction_element = item.find('p', {"class": "css-16nhkrn"})
    #     title = title_element.get_text() if title_element else ""
    #     introduction = introduction_element.get_text() if introduction_element else ""
    #     news = {
    #         "title": title,
    #         "introduction": introduction,
    #         "imageName": filename
    #     }
    #     data.append(news)
    # logger.debug(data)
    # Save the data to a file
    # with open(f'{fileDir}data.json', "w", encoding="utf-8") as file:
    #     json.dump(data, file, indent=2, ensure_ascii=False)
    # Send the crawled data to the Java service
    # print('----------------------')
    # print(data)
    if len(data) > 0:
        # Save the JSON file locally
        log.debug(os.path.abspath("../"))
        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "facebook", str(int(time.time())) + ".json")}'
        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
        state_save = save_json(file_dir, data)
        # log.debug("-----------------------------")
        # write_to_database(data)
        if state_save:
            log.debug('save file success')
            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
            # log.debug('file_path:' + file_dir)
            # form_data = {
            #     "path": file_dir,
            #     "tableName": table_name
            # }
            # response = importJsonPath(form_data)
        else:
            log.debug('save file failed')
    else:
        # Nothing was crawled
        log.info("未爬取到数据")
        # form_data = {
        #     "path": "",
        #     "tableName": table_name
        # }
        # response = importJsonPath(form_data)
    browser.quit()


def main():
    """
    """
    # Fetch the crawl keyword
    response = getReptileTask()
    global status_task
    # print(response)
    if response['status_code'] == 200 and response['data']['code'] == 200:
        log.debug("call success")
        search_word = ""
        for item in response['data']['rows']:
            if item['name'] == 'ptt':
                search_word = item['keyword']
                table_name = item['tableName']
                status_task = item["status"]
        # Convert Simplified to Traditional Chinese
        if status_task == 0 and len(search_word) > 0:
            reptile(None, convert_to_traditional(search_word))
        else:
            log.debug("爬取任务未启用")
    else:
        log.debug("call failed")
        # Request timed out
        reptile(None, convert_to_traditional("新闻"))
        # upload_control()


# Global variables
data = []
table_name = "pms_facebook"
# Call main
main()

pc_ptt.py  (+11 −13)

...
@@ -150,9 +150,6 @@ def reptile(browser=None, search_word=""):
             }
         # --------------- assemble data: end ---------------------
-        if search_word is None or search_word == str(search_word):
-            data.append(obj)
-        else:
         # Match the keyword with a regular expression
         # log.debug(f"关键词:{search_word}-{element_title.text}")
         matches = re.findall(search_word, element_title.text)
...
@@ -225,13 +222,14 @@ def main():
                 table_name = item['tableName']
                 status_task = item["status"]
         # Convert Simplified to Traditional Chinese
-        if status_task == 0:
+        if status_task == 0 and len(search_word) > 0:
             reptile(None, convert_to_traditional(search_word))
         else:
             log.debug("爬取任务未启用")
     else:
         log.debug("call failed")
-        reptile(None, '')
+        # Request timed out
+        reptile(None, convert_to_traditional("新闻"))
         # upload_control()
...
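
Worth noting for the first hunk above: the retained matching line treats search_word as a regular-expression pattern rather than a literal substring. A small illustration with made-up strings:

    import re

    print(re.findall("新闻", "今日新闻快报"))   # ['新闻']  -> keyword found in the title
    print(re.findall("新闻", "生活分享"))       # []        -> no match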

pc_twitter.py  (+90 −29)

...
@@ -5,7 +5,7 @@ from utils.Logger import log
 from utils.createBrowserDriver import create
 from utils.filse import save_json
 from api.index import importJson, getReptileTask, importJsonPath
 from utils.index import convert_to_traditional, yt_dlp_download, convert_string_to_time
 # from pytube import YouTube
 import os
 from config.settings import get_base_file_url
...
@@ -20,14 +20,14 @@ def reptile(browser=None, search_word=""):
     url = "https://twitter.com/"
     option = ['--headless']
     # ['--headless']
-    browser = browser or create()
+    browser = browser or create(option, False)
     # print(browser)
     # Open the page
     browser.get(url)
     time.sleep(3)
     try:
         # Check whether a login is required
         login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
         login_input.send_keys("liyang1851603")
         # Get the "Next" button
         buttons = browser.find_element('xpath', "//div[@role='button'][2]")
...
@@ -36,40 +36,92 @@ def reptile(browser=None, search_word=""):
         password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
         password_input.send_keys("liyang19970814")
         # Get the login button
         button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
         button_login.click()
     except:
         print("------")
-    # print(333333)
     # time.sleep(3)
+    time.sleep(2)
+    url = 'https://twitter.com/search?q=' + search_word + '&src=typed_query'
+    browser.get(url)
+    time.sleep(3)
+    # Content blocks
+    element_content_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
     # Authors
-    element_authors_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//a[@role='link']//div[@dir='ltr']")
-    print(element_authors_list)
-    print("2222")
+    element_authors_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']/div[1]//a[@role='link']")
     # Publish times
     element_release_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article//div[@data-testid='User-Name']//div[2]//time[@datetime]")
+    # Titles
+    # element_title_list = browser.find_element('xpath',)
-    # Content
-    element_content_list = browser.find_elements('xpath', "//div[@data-testid='cellInnerDiv']//article/div/div/div[2]/div[2]")
     # print(element_content_list)
     length = len(element_authors_list)
-    print(length)
     for index in range(length):
         author = element_authors_list[index].text
-        release_time = element_release_list[index].get_attribute("datetime")
-        content = element_content_list[index]
-        print(content)
+        release_time = str(int(parse_twitter_time_string(element_release_list[index].text)))
+        content = element_content_list[index].get_attribute("innerHTML")
+        # print(content)
         # Content filtering
         # Parse the HTML with BeautifulSoup
-        soup = BeautifulSoup(content.get_attribute("innerHTML"), 'html.parser')
-        print(soup)
-        print("-----")
-        # browser.close()
-        # # Close the browser driver
-        # browser.quit()
+        soup = BeautifulSoup(content, 'html.parser')
+        # ---------------- determine content type: start ----------
+        # Type
+        content_type = ""
+        try:
+            # Find all img tags
+            img_tags = soup.find_all('img')
+            if len(img_tags) > 0:
+                content_type = "图文"
+            else:
+                content_type = "文字"
+        except:
+            content_type = "文字"
+        # ---------------- determine content type: end ----------
+        # --------------- assemble data: start ---------------------
+        obj = {"title": "",
+               "content": content,
+               "link": "",
+               "reptileTime": str(int(time.time())),
+               "type": content_type,
+               "author": author,
+               "releaseTime": release_time
+               }
+        # --------------- assemble data: end ---------------------
+    # Send the crawled data to the Java service
+    # print('----------------------')
+    # print(data)
+    if len(data) > 0:
+        # Save the JSON file locally
+        log.debug(os.path.abspath("../"))
+        file_dir = f'{os.path.join(os.path.abspath("../"), "network-assets-reptile", "reptile_data", "twitter", str(int(time.time())) + ".json")}'
+        # file_dir = f'./reptile_data/ptt/{int(time.time())}.json'
+        state_save = save_json(file_dir, data)
+        # log.debug("-----------------------------")
+        # write_to_database(data)
+        if state_save:
+            log.debug('save file success')
+            # path = os.path.abspath(file_dir).join(file_dir).join(".json")
+            # log.debug('file_path:' + file_dir)
+            # form_data = {
+            #     "path": file_dir,
+            #     "tableName": table_name
+            # }
+            # response = importJsonPath(form_data)
+        else:
+            log.debug('save file failed')
+    else:
+        # Nothing was crawled
+        log.info("未爬取到数据")
+        # form_data = {
+        #     "path": "",
+        #     "tableName": table_name
+        # }
+        # response = importJsonPath(form_data)
+    # Close the browser driver
+    # time.sleep(3)
+    browser.quit()


 def main():
...
@@ -78,23 +130,32 @@ def main():
     """
     # Fetch the crawl keyword
     response = getReptileTask()
+    global status_task
     # print(response)
     if response['status_code'] == 200 and response['data']['code'] == 200:
         log.debug("call success")
         search_word = ""
         for item in response['data']['rows']:
-            if item['name'] == 'twitter':
+            if item['name'] == 'ptt':
                 search_word = item['keyword']
                 table_name = item['tableName']
+                status_task = item["status"]
+        # Convert Simplified to Traditional Chinese
+        if status_task == 0 and len(search_word) > 0:
             reptile(None, convert_to_traditional(search_word))
+        else:
+            log.debug("爬取任务未启用")
     else:
         log.debug("call failed")
-        reptile(None, '')
+        # Request timed out
+        reptile(None, convert_to_traditional("新闻"))
         # upload_control()

 # Global variables
 data = []
 table_name = "pms_twitter"
+# Whether the task is enabled
+status_task = '0'
 # Call main
 main()

pc_youtube.py  (+8 −6)

...
@@ -61,9 +61,9 @@ def reptile(browser=None, search_word=""):
     base_urr = get_base_file_url()
     releaseTime = ""
     try:
-        releaseTime = str(convert_string_to_time(element_time_list[index].text))
+        releaseTime = str(int(convert_string_to_time(element_time_list[index].text)))
     except:
-        releaseTime = str(time.time())
+        releaseTime = str(int(time.time()))
     # Download the video
     state_download = yt_dlp_download(url, 'youtube')
     log.debug(url)
...
@@ -124,23 +124,25 @@ def main():
     """
     # Fetch the crawl keyword
     response = getReptileTask()
-    # print(response)
     global status_task
+    # print(response)
     if response['status_code'] == 200 and response['data']['code'] == 200:
         log.debug("call success")
         search_word = ""
         for item in response['data']['rows']:
-            if item['name'] == 'youtube':
+            if item['name'] == 'ptt':
                 search_word = item['keyword']
                 table_name = item['tableName']
                 status_task = item["status"]
-        if status_task == 0:
+        # Convert Simplified to Traditional Chinese
+        if status_task == 0 and len(search_word) > 0:
             reptile(None, convert_to_traditional(search_word))
         else:
             log.debug("爬取任务未启用")
     else:
         log.debug("call failed")
-        reptile(None, '')
+        # Request timed out
+        reptile(None, convert_to_traditional("新闻"))
         # upload_control()
...

utils/createBrowserDriver.py  (+4 −2)

...
@@ -12,9 +12,10 @@ from selenium.webdriver.support.ui import WebDriverWait
 '''
-def create(option=None):
+def create(option=None, using_user_data=True):
     """
+    :param using_user_data:
     :param option:
     :return:
     """
...
@@ -28,6 +29,7 @@ def create(option=None):
     script = f'--user-data-dir={user_data_dir}'
     # print(script)
     # log.debug(script)
+    if using_user_data:
         chrome_options.add_argument(script)  # Set a custom user profile path
     if sys.platform.startswith('linux'):
...
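
The new using_user_data switch is what the updated pc_twitter.py relies on via create(option, False); a minimal sketch of both call styles, assuming the defaults shown in the hunk above:

    from utils.createBrowserDriver import create

    # Default: reuse the persistent Chrome user-data directory, as before.
    browser = create(['--headless'])

    # Opt out of the saved profile, as the updated Twitter crawler now does.
    browser = create(['--headless'], False)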

utils/index.py  (+48 −2)

...
@@ -5,12 +5,46 @@ from pytube import YouTube
 import ssl
 import subprocess
 from utils.Logger import log
-from datetime import datetime, timedelta
+import datetime
+from dateutil import parser
 # from dateutil.relativedelta import relativedelta

 ssl._create_default_https_context = ssl._create_stdlib_context


+# Convert facebook post times
+def parse_time_string(time_str):
+    """
+    :param time_str:
+    :return:
+    """
+    log.debug(f'转换face4book的发布时间:{time_str}')
+    if "天" in time_str:
+        number = int(time_str.split("天")[0])
+        time_delta = datetime.timedelta(days=number)
+        return int((datetime.datetime.now() + time_delta).timestamp())
+    elif "分钟" in time_str:
+        number = int(time_str.split("分钟")[0])
+        time_delta = datetime.timedelta(minutes=number)
+        return int((datetime.datetime.now() + time_delta).timestamp())
+    elif "小时" in time_str:
+        number = int(time_str.split("小时")[0])
+        time_delta = datetime.timedelta(hours=number)
+        return int((datetime.datetime.now() + time_delta).timestamp())
+    else:
+        try:
+            datetime_str = time_str.replace("月", " ").replace("日", "")
+            month, day, hour, minute = map(int, datetime_str.split())
+            current_year = datetime.datetime.now().year
+            datetime_obj = datetime.datetime(year=current_year, month=month, day=day, hour=hour, minute=minute)
+            return int(datetime_obj.timestamp())
+        except ValueError:
+            return None
+
+
+# Convert youtube times
 def convert_string_to_time(string):
     """
...
@@ -38,6 +72,18 @@ def convert_string_to_time(string):
     timestamp = int(converted_time.timestamp())
     return timestamp


+# Convert twitter times
+def parse_twitter_time_string(time_str):
+    """
+    :param time_str:
+    :return:
+    """
+    # Parse the relative time string
+    return datetime.timestamp(parser.parse(time_str, fuzzy=True))
+
+
 def convert_to_traditional(simplified_text):
     """
     Convert Simplified Chinese text to Traditional Chinese.
...
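
A small illustration of the relative strings parse_time_string understands; the sample inputs are hypothetical, and note that, as written, the helper adds the parsed offset to the current time rather than subtracting it:

    # Relative Facebook-style strings: the numeric prefix is split off and turned
    # into a timedelta ("天" = days, "小时" = hours, "分钟" = minutes).
    print(parse_time_string("3小时"))    # unix timestamp three hours from now, as coded
    print(parse_time_string("2天"))      # unix timestamp two days from now
    print(parse_time_string("45分钟"))   # unix timestamp forty-five minutes from now

Anything else falls through to the absolute "月/日" branch, which returns None when the string cannot be split into month, day, hour and minute.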