Commit 1526bcd6 authored by liyang's avatar liyang

feat:爬取instagram

parent 2fe822c2
...@@ -13,8 +13,5 @@ reptile_data/**/*.json ...@@ -13,8 +13,5 @@ reptile_data/**/*.json
*.mp4 *.mp4
*.webm *.webm
*.jpg *.jpg
*.app
*.exe
*.deb
browser/*chrome* browser/*chrome*
browser/**/chromedriver browser/**/chromedriver
\ No newline at end of file
...@@ -2,8 +2,26 @@ ...@@ -2,8 +2,26 @@
def get_log_path(): def get_log_path():
return "../" return "../"
def get_base_url(): def get_base_url():
return "http://192.168.0.118:8081/" return "http://192.168.0.118:8081/"
def get_base_file_url(): def get_base_file_url():
return "http://192.168.0.118:8186/" return "http://192.168.0.118:8186/"
\ No newline at end of file
def get_account(name):
    """Return the login credentials used to scrape a given platform.

    Each value can be overridden through the environment so that secrets do
    not have to live in the repository: for platform ``"twitter"`` the
    variables ``TWITTER_ACCOUNT_NAME`` and ``TWITTER_ACCOUNT_PASSWORD`` are
    consulted (same pattern, upper-cased, for the other platforms). When a
    variable is not set, the value previously hard-coded here is used, so
    existing callers keep working unchanged.

    Args:
        name: Platform identifier; one of ``"twitter"``, ``"facebook"`` or
            ``"instagram"``.

    Returns:
        A dict with the keys ``"name"`` and ``"password"`` for a known
        platform, or an empty dict for anything else (callers indexing
        ``["name"]`` on that result will get a ``KeyError``).
    """
    # Function-scope import: the module's own import block is not visible
    # from this hunk, so the dependency is kept local to the function.
    import os

    # SECURITY(review): these fallbacks are real credentials committed to
    # version control. They are kept only for backward compatibility; rotate
    # them and supply the values via the environment variables instead.
    fallbacks = {
        "twitter": ("liyang1851603", "liyang19970814"),
        "facebook": ("liyang19970814@gmail.com", "xn89kiPT/^Kaeg#"),
        "instagram": ("anthonymills7693", "unm8rgoab52"),
    }

    # Non-string names can never match a platform; the isinstance guard also
    # avoids a TypeError from dict lookup on unhashable arguments.
    entry = fallbacks.get(name) if isinstance(name, str) else None
    if entry is None:
        # Previously an empty string was printed here, which hid the problem.
        print(f"get_account: unknown platform {name!r}")
        return {}

    fallback_name, fallback_password = entry
    prefix = name.upper()
    return {
        "name": os.environ.get(f"{prefix}_ACCOUNT_NAME", fallback_name),
        "password": os.environ.get(f"{prefix}_ACCOUNT_PASSWORD", fallback_password),
    }
...@@ -11,6 +11,7 @@ from datetime import datetime ...@@ -11,6 +11,7 @@ from datetime import datetime
from utils.download_image import download_image from utils.download_image import download_image
import os import os
from config.settings import get_base_file_url from config.settings import get_base_file_url
from config.settings import get_account
import sys import sys
# 工具函数-下载图片 # 工具函数-下载图片
''' '''
...@@ -29,8 +30,8 @@ def reptile(browser=None, search_word=""): ...@@ -29,8 +30,8 @@ def reptile(browser=None, search_word=""):
# 检测是否要登录 # 检测是否要登录
login_input = browser.find_element('xpath', "//input[@name='email']") login_input = browser.find_element('xpath', "//input[@name='email']")
password_input = browser.find_element('xpath', "//input[@name='pass']") password_input = browser.find_element('xpath', "//input[@name='pass']")
login_input.send_keys("liyang19970814@gmail.com") login_input.send_keys(get_account("facebook")["name"])
password_input.send_keys("xn89kiPT/^Kaeg#") password_input.send_keys(get_account("facebook")["password"])
# 获取登录按钮 # 获取登录按钮
button_login = browser.find_element('xpath', "//button[@name='login']") button_login = browser.find_element('xpath', "//button[@name='login']")
button_login.click() button_login.click()
......
This diff is collapsed.
...@@ -12,7 +12,7 @@ import sys ...@@ -12,7 +12,7 @@ import sys
from datetime import datetime from datetime import datetime
from utils.download_image import download_image from utils.download_image import download_image
from config.settings import get_base_file_url from config.settings import get_base_file_url
from config.settings import get_account
# 工具函数-下载图片 # 工具函数-下载图片
''' '''
打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。 打开指定网页,并使用 Selenium 模拟点击 "GDPR-accept" 按钮,然后循环点击 "search-show-more-button" 按钮来加载更多数据,直到按钮不再可点击为止。最后,获取完整的分页数据并关闭浏览器驱动。
...@@ -37,13 +37,13 @@ def reptile(browser=None, search_word=""): ...@@ -37,13 +37,13 @@ def reptile(browser=None, search_word=""):
try: try:
# 检测是否要登录 # 检测是否要登录
login_input = browser.find_element('xpath', "//input[@autocomplete='username']") login_input = browser.find_element('xpath', "//input[@autocomplete='username']")
login_input.send_keys("liyang1851603") login_input.send_keys(get_account("twitter")["name"])
# 获取下一步按钮 # 获取下一步按钮
buttons = browser.find_element('xpath', "//div[@role='button'][2]") buttons = browser.find_element('xpath', "//div[@role='button'][2]")
buttons.click() buttons.click()
time.sleep(3) time.sleep(3)
password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']") password_input = browser.find_element('xpath', "//input[@autocomplete='current-password']")
password_input.send_keys("liyang19970814") password_input.send_keys(get_account("twitter")["password"])
# # 获取登录按钮 # # 获取登录按钮
button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']") button_login = browser.find_element('xpath', "//div[@data-testid='LoginForm_Login_Button']")
button_login.click() button_login.click()
......
APScheduler==3.10.1
asgiref==3.7.2
async-generator==1.10
attrs==23.1.0
beautifulsoup4==4.12.2
certifi==2023.5.7
charset-normalizer==3.1.0
Django==4.2.2
docopt==0.6.2
exceptiongroup==1.1.1
h11==0.14.0
idna==3.4
loguru==0.7.0
lxml==4.9.2
outcome==1.2.0
pipreqs==0.4.13
PyMySQL==1.1.0
PySocks==1.7.1
pytube==15.0.0
pytz==2023.3
requests==2.31.0
selenium==4.10.0
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4.1
sqlparse==0.4.4
trio==0.22.0
trio-websocket==0.10.3
typing_extensions==4.7.0
tzlocal==5.0.1
urllib3==2.0.3
wsproto==1.2.0
yarg==0.1.9
OpenCC~=1.1.1
python-dateutil~=2.8.2
\ No newline at end of file
...@@ -28,7 +28,7 @@ from utils.index import get_screen_resolution ...@@ -28,7 +28,7 @@ from utils.index import get_screen_resolution
''' '''
def create(option=None, using_user_data=True, web_browser="chrome_test"): def create(option=None, using_user_data=True, web_browser="chromium"):
""" """
:param web_browser: :param web_browser:
...@@ -50,7 +50,8 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"): ...@@ -50,7 +50,8 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"):
options = webdriver.ChromeOptions() options = webdriver.ChromeOptions()
elif web_browser == "edge": elif web_browser == "edge":
options = webdriver.EdgeOptions() options = webdriver.EdgeOptions()
elif web_browser == "chromium":
options = webdriver.ChromeOptions()
if option is not None: if option is not None:
for value in option: for value in option:
options.add_argument(value) options.add_argument(value)
...@@ -64,7 +65,7 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"): ...@@ -64,7 +65,7 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"):
# options.add_argument(f'--user-data-dir={user_data_dir}') # options.add_argument(f'--user-data-dir={user_data_dir}')
elif web_browser == "chrome": elif web_browser == "chrome":
options.add_argument(f'--user-data-dir={user_data_dir}') options.add_argument(f'--user-data-dir={user_data_dir}')
elif web_browser == "chrome_test": elif web_browser == "chromium":
options.add_argument(f'--user-data-dir={user_data_dir}') options.add_argument(f'--user-data-dir={user_data_dir}')
elif web_browser == "chrome_test": elif web_browser == "chrome_test":
options.add_argument(f'--user-data-dir={user_data_dir}') options.add_argument(f'--user-data-dir={user_data_dir}')
...@@ -92,7 +93,7 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"): ...@@ -92,7 +93,7 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"):
elif web_browser == "chrome": elif web_browser == "chrome":
# 创建Chrome浏览器对象并传入选项 # 创建Chrome浏览器对象并传入选项
web_browser = webdriver.Chrome(options=options, service=ChromeService(ChromeDriverManager().install())) web_browser = webdriver.Chrome(options=options, service=ChromeService(ChromeDriverManager().install()))
elif web_browser == "chrome_test": elif web_browser == "chromium":
binary_location = "" binary_location = ""
webdriver_location = "" webdriver_location = ""
if platform.system() == "Windows": if platform.system() == "Windows":
...@@ -115,7 +116,7 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"): ...@@ -115,7 +116,7 @@ def create(option=None, using_user_data=True, web_browser="chrome_test"):
# 指定浏览器路径 # 指定浏览器路径
# print(binary_location) # print(binary_location)
# 指定浏览器路径 # 指定浏览器路径
options.binary_location = binary_location # options.binary_location = binary_location
# options.browser_version = "114" # options.browser_version = "114"
# 设置驱动二进制可执行文件路径 # 设置驱动二进制可执行文件路径
# service = ChromeService(executable_path=webdriver_location) # service = ChromeService(executable_path=webdriver_location)
......
...@@ -2,7 +2,7 @@ import os.path ...@@ -2,7 +2,7 @@ import os.path
import re import re
import time import time
import opencc from hanziconv import HanziConv
import datetime import datetime
from pytube import YouTube from pytube import YouTube
import ssl import ssl
...@@ -135,8 +135,8 @@ def convert_to_traditional(simplified_text): ...@@ -135,8 +135,8 @@ def convert_to_traditional(simplified_text):
Returns: Returns:
str: 转换后的繁体中文文本。 str: 转换后的繁体中文文本。
""" """
converter = opencc.OpenCC('s2t.json') # 创建简体中文到繁体中文的转换器 # converter = opencc.OpenCC('s2t.json') # 创建简体中文到繁体中文的转换器
traditional_text = converter.convert(simplified_text) # 进行转换 traditional_text = HanziConv.toTraditional(simplified_text) # 进行转换
return traditional_text return traditional_text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment