Files
SearchCompany/company/aiqicha_crawler.py
manchuwork 102dd78c26 aiqicha
2025-09-25 03:19:34 +08:00

320 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import random
import time
from urllib.parse import quote

from playwright.sync_api import sync_playwright
# JSON file used to persist browser cookies between runs (written by
# save_cookies, read back by load_cookies). Relative to the working directory.
COOKIE_PATH = "aiqicha_cookies.json"
class AiqichaCrawler:
    """Playwright-driven crawler for company search results on aiqicha.baidu.com.

    Cookies are persisted to ``COOKIE_PATH`` so a QR-code login can be reused
    across runs. ``browser``, ``context`` and ``page`` are populated by
    :meth:`run`; the page/context-based methods expect them to be set.
    """

    BASE_URL = "https://aiqicha.baidu.com"
    # Elements whose visibility this crawler treats as "user is logged in".
    LOGGED_IN_SELECTOR = '.header-user-center-menu, .user-center'

    def __init__(self):
        """Create a crawler with no browser attached yet (see :meth:`run`)."""
        self.browser = None
        self.context = None
        self.page = None

    def anti_detection(self):
        """Register a broader anti-detection init script on the page.

        The script (installed through ``page.add_init_script``) removes
        ``navigator.webdriver`` from the prototype, defines ``navigator.chrome``,
        ``plugins``, ``mimeTypes`` and ``languages``, turns ``console.debug`` /
        ``console.log`` into no-ops and pins the screen and window dimensions
        to 1366x768.
        """
        self.page.add_init_script("""
            // 隐藏webdriver属性
            delete navigator.__proto__.webdriver;
            // 伪装chrome属性
            Object.defineProperty(navigator, 'chrome', {
                value: {
                    runtime: {},
                    loadTimes: function() {}
                },
                writable: false,
                enumerable: true,
                configurable: true
            });
            // 伪装plugins和mimeTypes
            Object.defineProperty(navigator, 'plugins', {
                get: () => [
                    { 0: { type: 'application/pdf' } },
                    { 0: { type: 'application/x-google-chrome-pdf' } }
                ],
            });
            Object.defineProperty(navigator, 'mimeTypes', {
                get: () => [
                    { type: 'application/pdf' },
                    { type: 'application/x-google-chrome-pdf' }
                ],
            });
            // 伪装languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['zh-CN', 'zh'],
            });
            // 禁用调试功能
            window.console.debug = function() {};
            window.console.log = function() {};
            // 伪装屏幕信息
            Object.defineProperty(screen, 'width', {get: () => 1366});
            Object.defineProperty(screen, 'height', {get: () => 768});
            Object.defineProperty(screen, 'availWidth', {get: () => 1366});
            Object.defineProperty(screen, 'availHeight', {get: () => 768});
            Object.defineProperty(screen, 'colorDepth', {get: () => 24});
            Object.defineProperty(screen, 'pixelDepth', {get: () => 24});
            // 伪装时间戳
            window.chrome = {
                runtime: {}
            };
            // 伪装outerHeight和outerWidth
            Object.defineProperty(window, 'outerHeight', {get: () => 768});
            Object.defineProperty(window, 'outerWidth', {get: () => 1366});
            // 伪装innerHeight和innerWidth
            Object.defineProperty(window, 'innerHeight', {get: () => 768});
            Object.defineProperty(window, 'innerWidth', {get: () => 1366});
        """)

    def random_behavior(self):
        """Simulate human-like behaviour: idle, move the mouse, maybe scroll."""
        # Random idle pause.
        time.sleep(random.uniform(2, 5))
        # A handful of random mouse moves with short pauses in between.
        for _ in range(random.randint(3, 7)):
            self.page.mouse.move(
                random.randint(100, 1200),
                random.randint(100, 600),
            )
            time.sleep(random.uniform(0.1, 0.8))
        # Scroll the page roughly half of the time.
        if random.choice([True, False]):
            scroll_distance = random.randint(200, 800)
            self.page.mouse.wheel(0, scroll_distance)
            time.sleep(random.uniform(1, 2))

    def init_cookie_file(self):
        """Create ``COOKIE_PATH`` holding an empty JSON list if it is missing."""
        if not os.path.exists(COOKIE_PATH):
            with open(COOKIE_PATH, 'w', encoding="utf-8") as f:
                json.dump([], f)

    def save_cookies(self):
        """Write the current browser-context cookies to ``COOKIE_PATH``."""
        cookies = self.context.cookies()
        with open(COOKIE_PATH, 'w', encoding="utf-8") as f:
            json.dump(cookies, f, indent=2)

    def load_cookies(self):
        """Load cookies from ``COOKIE_PATH`` into the browser context.

        Returns:
            True if a non-empty cookie list was read and added to the context;
            False if the list is empty or reading/adding the cookies failed.
        """
        try:
            with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
                cookies = json.load(f)
            if not cookies:
                return False
            self.context.add_cookies(cookies)
            return True
        except Exception as exc:
            # Best effort: any failure means "no usable cookies", but unlike a
            # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
            print(f"加载Cookie失败: {exc}")
            return False

    def bypass_debugger(self):
        """Register an init script that neutralises common debugger traps.

        The script replaces ``Function.prototype.constructor`` and
        ``console.debug`` with no-ops and makes ``navigator.webdriver``
        report ``false``.
        """
        self.page.add_init_script("""
            window.Function.prototype.constructor = function() {};
            window.console.debug = function(){};
            Object.defineProperty(navigator, 'webdriver', {get: () => false});
        """)

    def check_login_status(self):
        """Detect whether the current page shows a logged-in user.

        Returns:
            True when a logged-in marker element is visible, False otherwise
            (including when detection itself fails).
        """
        try:
            # Dismiss a possible "new feature" popup first.
            self.close_feature_popup()
            self.page.wait_for_load_state("networkidle")
            # Any visible user-centre element means we are logged in.
            for element in self.page.query_selector_all(self.LOGGED_IN_SELECTOR):
                if element and element.is_visible():
                    print("检测到已登录状态")
                    return True
            login_element = self.page.query_selector('.login')
            if login_element and login_element.is_visible():
                print("检测到未登录状态")
            # No logged-in marker was visible: always answer with an explicit
            # False rather than falling through and returning None.
            return False
        except Exception:
            # The primary probe failed; give the user-centre element a short
            # chance to appear before giving up.
            try:
                self.page.wait_for_selector('.user-center', timeout=3000)
                print("检测到已登录状态")
                return True
            except Exception:
                print("登录状态检测异常")
                return False

    def close_feature_popup(self):
        """Close the "new feature" popup if one is visible (best effort)."""
        try:
            for close_button in self.page.query_selector_all('.close-icon.ivu-icon-ios-close'):
                if close_button.is_visible():
                    close_button.click()
                    print("已关闭功能上新弹窗")
                    # Give the popup a moment to disappear.
                    time.sleep(1)
                    break
        except Exception:
            # The popup is optional; failing to find or close it must not
            # abort the surrounding flow.
            pass

    def login(self, qr_timeout_ms=120000):
        """Open the home page and run the QR-code login flow if needed.

        Args:
            qr_timeout_ms: How long to wait, in milliseconds, for the
                logged-in marker to appear after the login button is clicked.
                The user has to scan a QR code by hand, so the default is
                two minutes.

        Raises:
            Whatever Playwright raises when the login button or the logged-in
            marker does not appear within its timeout.
        """
        self.page.goto(self.BASE_URL)
        self._strip_webdriver_flag()
        self.page.wait_for_load_state("networkidle")
        self.close_feature_popup()
        if self.check_login_status():
            return
        print("开始执行登录流程...")
        login_btn = self.page.wait_for_selector('.login', timeout=20000)
        login_btn.click()
        print("请扫描页面二维码登录...")
        time.sleep(3)  # give the QR code some time to finish loading
        # Login is considered complete once a user-centre element shows up.
        self.page.wait_for_selector(self.LOGGED_IN_SELECTOR, timeout=qr_timeout_ms)
        self.save_cookies()
        print("登录成功!")

    def search_company(self, company_name):
        """Search for ``company_name`` and return the first result's basics.

        Args:
            company_name: Name to search for; it is percent-encoded before
                being placed in the query string.

        Returns:
            dict with keys ``name``, ``legal_person``, ``reg_capital`` and
            ``status``. A field that is missing from the result card is
            returned as an empty string.

        Raises:
            RuntimeError: if no result card can be found on the page.
            Whatever Playwright raises if no ``.search-item`` appears within
            10 seconds.
        """
        # Encode the name so characters such as '&' or '#' cannot break the query.
        query = quote(str(company_name), safe='')
        self.page.goto(f"{self.BASE_URL}/s?q={query}")
        self._strip_webdriver_flag()
        self.close_feature_popup()
        self.page.wait_for_selector(".search-item", timeout=10000)
        company_card = self.page.query_selector(".search-item")
        if company_card is None:
            raise RuntimeError(f"no search result card found for {company_name!r}")
        return {
            "name": self._field_text(company_card, ".company-name"),
            "legal_person": self._field_text(company_card, ".legal-person"),
            "reg_capital": self._field_text(company_card, ".reg-capital"),
            "status": self._field_text(company_card, ".company-status"),
        }

    def run(self, companies):
        """Launch a browser, make sure we are logged in, then query companies.

        Args:
            companies: Iterable of company names. Each one is searched in
                turn; a failure for one name is printed and does not stop
                the remaining names.
        """
        self.init_cookie_file()
        with sync_playwright() as p:
            self.browser = self._launch_browser(p)
            try:
                self.context = self._new_context()
                try:
                    self.page = self.context.new_page()
                    self.anti_detection()
                    # Run the webdriver removal once right away as well.
                    self._strip_webdriver_flag()
                    self.random_behavior()
                    self._ensure_logged_in()
                    self._crawl(companies)
                finally:
                    # Close the context even when login or setup raised.
                    self.context.close()
            finally:
                self.browser.close()

    def _strip_webdriver_flag(self):
        """Delete ``navigator.webdriver`` from the prototype on the current page."""
        self.page.evaluate("""
            delete navigator.__proto__.webdriver;
        """)

    @staticmethod
    def _field_text(card, selector):
        """Return the inner text of ``selector`` inside ``card``, or ``""`` if absent."""
        node = card.query_selector(selector)
        return node.inner_text() if node is not None else ""

    def _launch_browser(self, p):
        """Launch a headed Chromium with the crawler's command-line flags."""
        # NOTE(review): --no-sandbox and --disable-web-security weaken the
        # browser's own protections; confirm they are really required.
        return p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-infobars",
                "--disable-extensions",
                "--disable-plugins",
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-web-security",
                "--disable-features=IsolateOrigins,site-per-process",
                "--disable-background-timer-throttling",
                "--disable-backgrounding-occluded-windows",
                "--disable-renderer-backgrounding",
                "--disable-ipc-flooding-protection",
            ],
        )

    def _new_context(self):
        """Create a desktop zh-CN / Asia-Shanghai browser context at 1366x768."""
        return self.browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={"width": 1366, "height": 768},
            device_scale_factor=1,
            is_mobile=False,
            has_touch=False,
            locale="zh-CN",
            timezone_id="Asia/Shanghai",
        )

    def _ensure_logged_in(self):
        """Reuse saved cookies when they still work, otherwise log in again."""
        if not self.load_cookies():
            print("未找到有效Cookie开始登录流程...")
            self.login()
            return
        print("已加载Cookie验证登录状态...")
        # Visit the site to verify the loaded cookies really log us in.
        self.page.goto(self.BASE_URL)
        self.page.wait_for_load_state("networkidle")
        if self.check_login_status():
            print("Cookie有效已登录")
        else:
            print("Cookie已过期或无效重新登录...")
            self.login()

    def _crawl(self, companies):
        """Search every company, printing one line per result or failure."""
        for company in companies:
            try:
                data = self.search_company(company)
                print(f"{data['name']} | 法人:{data['legal_person']} | 注册资本:{data['reg_capital']}")
                self.save_cookies()  # refresh the stored cookies after each query
                time.sleep(3)  # avoid sending requests too quickly
            except Exception as e:
                print(f"查询 {company} 失败: {str(e)}")
if __name__ == "__main__":
    # Demo entry point: look up a few well-known companies.
    target_companies = ["阿里巴巴", "腾讯科技", "华为技术"]
    AiqichaCrawler().run(target_companies)