This commit is contained in:
manchuwork
2025-09-25 03:19:50 +08:00
parent 102dd78c26
commit 54f3beded9

86
company/qcc_crawler.py Normal file
View File

@@ -0,0 +1,86 @@
from playwright.sync_api import sync_playwright
import json
import os
import time
# Path of the JSON file in which browser cookies are persisted between runs
# (relative to the current working directory).
COOKIE_FILE = "qcc_cookies.json"
def ensure_cookie_file():
    """Create ``COOKIE_FILE`` holding an empty JSON list if it does not exist.

    The file is opened in exclusive-create mode so that an existing cookie
    file is never truncated, even if another process creates it between a
    check and the write. A notice is printed only when a new file was
    actually created.
    """
    try:
        # "x" fails with FileExistsError instead of overwriting existing data.
        with open(COOKIE_FILE, "x", encoding="utf-8") as f:
            json.dump([], f)
    except FileExistsError:
        return
    print(f"已创建新的cookie文件: {COOKIE_FILE}")
def save_cookies(context):
    """Persist the cookies of *context* to ``COOKIE_FILE`` as indented JSON.

    Args:
        context: Object exposing a ``cookies()`` method whose return value is
            JSON-serialisable (here: the Playwright browser context).

    The data is first written to a sibling temporary file and then moved
    over ``COOKIE_FILE`` with ``os.replace``. This function is called after
    every query, so an interruption mid-write must not leave a truncated,
    unparseable cookie file behind.
    """
    cookies = context.cookies()
    tmp_path = f"{COOKIE_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cookies, f, indent=2)
    # Swap the fully written file into place in a single step.
    os.replace(tmp_path, COOKIE_FILE)
    print(f"Cookies已保存到 {COOKIE_FILE}")
def load_cookies(context):
    """Load cookies from ``COOKIE_FILE`` into *context*.

    Args:
        context: Object exposing ``add_cookies(cookies)`` (here: the
            Playwright browser context).

    Returns:
        ``True`` when the file held a non-empty value that was passed to
        ``context.add_cookies``; ``False`` when the stored value was empty or
        when reading, parsing or adding the cookies raised any exception
        (the error is printed, not propagated).
    """
    try:
        with open(COOKIE_FILE, 'r') as cookie_fp:
            stored = json.load(cookie_fp)
        # Nothing saved yet: the caller has to log in.
        if not stored:
            return False
        context.add_cookies(stored)
        return True
    except Exception as err:
        print(f"加载cookies失败: {str(err)}")
        return False
def qcc_login(page):
    """Drive the QR-code login flow on qcc.com and block until it completes.

    Opens the home page, waits up to 5 s for the ``.login-container``
    element, clicks the element with the text "扫码登录" (QR-code login),
    then waits up to 120 s for the URL to match ``**/usercenter**`` while
    the user scans the code.

    Args:
        page: Page object providing ``goto``, ``wait_for_selector``,
            ``click`` and ``wait_for_url`` (here: a Playwright page).
    """
    login_box_timeout_ms = 5000
    qr_scan_timeout_ms = 120000

    page.goto("https://www.qcc.com")
    page.wait_for_selector(".login-container", timeout=login_box_timeout_ms)
    page.click("text=扫码登录")

    print("请扫描页面二维码登录...")
    # Reaching a URL matching the user-center pattern is treated as success.
    page.wait_for_url("**/usercenter**", timeout=qr_scan_timeout_ms)
    print("登录成功!")
def search_company(page, company_name):
    """Search qcc.com for *company_name* and return the first hit's details.

    Args:
        page: Page object providing ``goto``, ``wait_for_selector`` and
            ``locator`` (here: a Playwright page).
        company_name: Search keyword. It is percent-encoded before being
            placed in the query string, so characters such as ``&``, ``#``,
            ``+`` or spaces cannot corrupt the ``key`` parameter.

    Returns:
        A dict with the keys ``"name"``, ``"legal_rep"`` and ``"status"``,
        each holding the inner text of the first element matching
        ``.company-name``, ``.legal-person`` and ``.company-status``.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    page.goto(f"https://www.qcc.com/web/search?key={quote(company_name, safe='')}")
    # Wait up to 10 s for at least one result row to be present.
    page.wait_for_selector(".search-result-item", timeout=10000)
    # Example data extraction
    company_info = {
        "name": page.locator(".company-name").first.inner_text(),
        "legal_rep": page.locator(".legal-person").first.inner_text(),
        "status": page.locator(".company-status").first.inner_text(),
    }
    return company_info
def main(companies=None):
    """Run the crawler: restore or establish a session, then query companies.

    Args:
        companies: Optional iterable of company names to look up. When
            omitted, the built-in sample list is used.

    Flow: make sure the cookie file exists, launch a visible Chromium
    window, load saved cookies (falling back to an interactive QR-code
    login when none are available), then search each company in turn and
    print its name, legal representative and status. A failure for one
    company is printed and does not stop the remaining lookups.

    The context and browser are closed in a ``finally`` block so they are
    released even when login or setup raises.

    NOTE(review): a non-empty cookie file is treated as a valid session;
    nothing here checks whether the stored cookies have expired.
    """
    if companies is None:
        companies = ["阿里巴巴", "腾讯科技", "华为技术"]

    ensure_cookie_file()
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = None
        try:
            context = browser.new_context()
            page = context.new_page()

            if not load_cookies(context):
                qcc_login(page)
                save_cookies(context)

            for company in companies:
                try:
                    info = search_company(page, company)
                    print(f"{info['name']} | 法人:{info['legal_rep']} | 状态:{info['status']}")
                    save_cookies(context)  # refresh stored cookies after every operation
                    # Pause between successful queries.
                    time.sleep(3)
                except Exception as e:
                    # Top-level boundary: report and continue with the next company.
                    print(f"查询 {company} 失败: {str(e)}")
        finally:
            if context is not None:
                context.close()
            browser.close()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()