aiqicha
This commit is contained in:
86
company/qcc_crawler.py
Normal file
86
company/qcc_crawler.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
import json
import os
import time
from urllib.parse import quote

from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
# Path (relative to the current working directory) of the JSON file that this
# script opens to read and write browser cookies.
COOKIE_FILE = "qcc_cookies.json"
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_cookie_file():
    """Create ``COOKIE_FILE`` holding an empty JSON list if it is missing.

    When the file already exists nothing is written and nothing is printed.
    Otherwise ``[]`` is dumped into a freshly created file and a notice
    naming the file is printed.
    """
    if os.path.exists(COOKIE_FILE):
        return

    with open(COOKIE_FILE, 'w') as fh:
        json.dump([], fh)
    print(f"已创建新的cookie文件: {COOKIE_FILE}")
|
||||||
|
|
||||||
|
|
||||||
|
def save_cookies(context):
    """Dump the cookies returned by ``context.cookies()`` into ``COOKIE_FILE``.

    Args:
        context: Object exposing a ``cookies()`` method (in this script, the
            Playwright browser context created in ``main``).

    The file is overwritten with the cookies serialised as JSON using a
    two-space indent, then a confirmation line is printed.
    """
    # Fetch the cookies before opening the file: opening in 'w' mode
    # truncates it, so a failure in cookies() must happen first.
    snapshot = context.cookies()
    with open(COOKIE_FILE, 'w') as fh:
        json.dump(snapshot, fh, indent=2)
    print(f"Cookies已保存到 {COOKIE_FILE}")
|
||||||
|
|
||||||
|
|
||||||
|
def load_cookies(context):
    """Read cookies from ``COOKIE_FILE`` and add them to *context*.

    Args:
        context: Object exposing ``add_cookies(list)`` (in this script, the
            Playwright browser context created in ``main``).

    Returns:
        ``True`` when the file contained a non-empty (truthy) JSON value that
        was passed to ``context.add_cookies``; ``False`` when the stored value
        was empty, or when any exception occurred while reading, parsing or
        adding the cookies (the error message is printed in that case).
    """
    try:
        with open(COOKIE_FILE, 'r') as fh:
            stored = json.load(fh)
        if not stored:
            # Nothing saved yet (e.g. the freshly created "[]" file).
            return False
        context.add_cookies(stored)
        return True
    except Exception as err:
        # Best-effort: any failure is reported and treated as "no cookies".
        print(f"加载cookies失败: {str(err)}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def qcc_login(page):
    """Open qcc.com and block until a QR-code login has completed.

    Args:
        page: Page object exposing ``goto``, ``wait_for_selector``, ``click``
            and ``wait_for_url`` (in this script, a Playwright page).

    Steps: navigate to the home page, wait for the ``.login-container``
    element, click the element whose text is "扫码登录", prompt the user to
    scan the code, then wait for the URL to match ``**/usercenter**``.
    Any timeout or other error raised by the page calls propagates to the
    caller.
    """
    # Timeout values passed to Playwright (which documents them as ms).
    login_panel_timeout = 5000
    scan_timeout = 120000

    page.goto("https://www.qcc.com")
    page.wait_for_selector(".login-container", timeout=login_panel_timeout)
    page.click("text=扫码登录")

    print("请扫描页面二维码登录...")
    # The wait ends once the browser lands on a URL containing "/usercenter".
    page.wait_for_url("**/usercenter**", timeout=scan_timeout)
    print("登录成功!")
|
||||||
|
|
||||||
|
|
||||||
|
def search_company(page, company_name):
    """Search qcc.com for *company_name* and return a summary of the first hit.

    Args:
        page: Page object exposing ``goto``, ``wait_for_selector`` and
            ``locator`` (in this script, a Playwright page).
        company_name: Search keyword. It is percent-encoded before being
            placed in the query string.

    Returns:
        A dict with the keys ``"name"``, ``"legal_rep"`` and ``"status"``,
        each holding the inner text of the first element matching
        ``.company-name``, ``.legal-person`` and ``.company-status``.

    Any timeout or lookup error raised by the page calls propagates to the
    caller.
    """
    # Percent-encode the keyword so characters such as "&", "#", "+" or
    # spaces remain part of the search term instead of being interpreted as
    # URL syntax (which would truncate or alter the query).
    encoded_key = quote(company_name, safe="")
    page.goto(f"https://www.qcc.com/web/search?key={encoded_key}")
    page.wait_for_selector(".search-result-item", timeout=10000)

    # Sample data extraction.
    # NOTE(review): the selectors are page-wide, not scoped to the first
    # ".search-result-item"; confirm all three resolve to the same result.
    field_selectors = {
        "name": ".company-name",
        "legal_rep": ".legal-person",
        "status": ".company-status",
    }
    return {
        field: page.locator(selector).first.inner_text()
        for field, selector in field_selectors.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the crawler: restore or obtain a login, then query sample companies.

    Flow:
        1. Make sure the cookie file exists.
        2. Launch a visible (non-headless) Chromium browser with one context
           and one page.
        3. If no stored cookies could be loaded, perform the QR-code login
           and save the resulting cookies.
        4. Search each company in the hard-coded list, print its summary,
           save cookies again and pause 3 seconds. A failing query is
           reported and skipped (without the pause).
        5. Close the context and the browser, even if an earlier step
           raised, so the browser is not left open after an error.
    """
    ensure_cookie_file()

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            try:
                page = context.new_page()

                if not load_cookies(context):
                    qcc_login(page)
                    save_cookies(context)

                companies = ["阿里巴巴", "腾讯科技", "华为技术"]
                for company in companies:
                    try:
                        info = search_company(page, company)
                        print(f"{info['name']} | 法人:{info['legal_rep']} | 状态:{info['status']}")
                        save_cookies(context)  # refresh stored cookies after every operation
                        time.sleep(3)
                    except Exception as e:
                        # One failed lookup must not abort the remaining ones.
                        print(f"查询 {company} 失败: {str(e)}")
            finally:
                context.close()
        finally:
            # Runs on login timeouts and other errors too.
            browser.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Start the crawler only when this file is executed as a script; importing
# the module does not launch a browser.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user