From 54f3beded998d66dd6b9df494a6de055f4e21a3c Mon Sep 17 00:00:00 2001 From: manchuwork Date: Thu, 25 Sep 2025 03:19:50 +0800 Subject: [PATCH] aiqicha --- company/qcc_crawler.py | 86 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 company/qcc_crawler.py diff --git a/company/qcc_crawler.py b/company/qcc_crawler.py new file mode 100644 index 0000000..fd5a274 --- /dev/null +++ b/company/qcc_crawler.py @@ -0,0 +1,86 @@ +from playwright.sync_api import sync_playwright +import json +import os +import time + +COOKIE_FILE = "qcc_cookies.json" + + +def ensure_cookie_file(): + if not os.path.exists(COOKIE_FILE): + with open(COOKIE_FILE, 'w') as f: + json.dump([], f) + print(f"已创建新的cookie文件: {COOKIE_FILE}") + + +def save_cookies(context): + cookies = context.cookies() + with open(COOKIE_FILE, 'w') as f: + json.dump(cookies, f, indent=2) + print(f"Cookies已保存到 {COOKIE_FILE}") + + +def load_cookies(context): + try: + with open(COOKIE_FILE, 'r') as f: + cookies = json.load(f) + if cookies: + context.add_cookies(cookies) + return True + return False + except Exception as e: + print(f"加载cookies失败: {str(e)}") + return False + + +def qcc_login(page): + page.goto("https://www.qcc.com") + page.wait_for_selector(".login-container", timeout=5000) + page.click("text=扫码登录") + + print("请扫描页面二维码登录...") + page.wait_for_url("**/usercenter**", timeout=120000) + print("登录成功!") + + +def search_company(page, company_name): + page.goto(f"https://www.qcc.com/web/search?key={company_name}") + page.wait_for_selector(".search-result-item", timeout=10000) + + # 示例数据提取 + company_info = { + "name": page.locator(".company-name").first.inner_text(), + "legal_rep": page.locator(".legal-person").first.inner_text(), + "status": page.locator(".company-status").first.inner_text() + } + return company_info + + +def main(): + ensure_cookie_file() + + with sync_playwright() as p: + browser = p.chromium.launch(headless=False) + context = browser.new_context() + page = context.new_page() + + if not load_cookies(context): + qcc_login(page) + save_cookies(context) + + companies = ["阿里巴巴", "腾讯科技", "华为技术"] + for company in companies: + try: + info = search_company(page, company) + print(f"{info['name']} | 法人:{info['legal_rep']} | 状态:{info['status']}") + save_cookies(context) # 每次操作后更新cookies + time.sleep(3) + except Exception as e: + print(f"查询 {company} 失败: {str(e)}") + + context.close() + browser.close() + + +if __name__ == "__main__": + main()