Files
SearchCompany/company/qcc.py
manchuwork 9d0f18a121 cookies
2025-09-05 16:51:46 +08:00

344 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# qcc.py
import json
import re
import os
from playwright.sync_api import sync_playwright
import argparse
def clean_text(text):
    """Return *text* with line breaks and tabs removed and whitespace collapsed.

    Carriage returns, newlines and tabs are deleted outright (not turned into
    spaces); every remaining run of whitespace becomes a single space and the
    result is stripped. Falsy input (``None``, ``""``) yields ``""``.
    """
    if text:
        # Drop the control characters first, exactly as a chain of
        # ``replace(..., '')`` calls would.
        without_breaks = text.translate({ord(ch): None for ch in '\r\n\t'})
        collapsed = re.sub(r'\s+', ' ', without_breaks)
        return collapsed.strip()
    return ""
class QCCParser:
    """Extract company registration fields from a QCC company page.

    The parser drives a page object through ``wait_for_load_state`` and
    ``query_selector``; ``main`` passes a Playwright sync page. Call
    ``parse_company_info()`` to locate the info table and read every field,
    or call ``init_table()`` followed by individual getters.

    Attributes:
        page: The page object handed to the constructor.
        company_data: Result of the last ``parse_company_info()`` call.
        table: The company info table element, or None until
            ``init_table()`` has found it.
    """

    # Trailing "关联企业 N" counter stripped from the legal representative's
    # name.
    _RELATED_SUFFIX = re.compile(r'\s*关联企业\s*\d+$')

    def __init__(self, page):
        self.page = page
        self.company_data = {}
        # Defined up front so getters called before init_table() return None
        # instead of raising AttributeError.
        self.table = None

    def init_table(self):
        """Locate the company info table and store it in ``self.table``.

        Returns:
            True when both the info container and its table were found,
            False otherwise (a message is printed in that case).
        """
        # Wait until the page has finished loading.
        self.page.wait_for_load_state('networkidle')
        container = self.page.query_selector("div.cominfo-normal")
        if not container:
            print("未找到企业信息容器")
            return False
        self.table = container.query_selector("table.ntable")
        if not self.table:
            print("未找到企业信息表格")
            return False
        return True

    def _find_value_cell(self, title):
        """Return the cell adjacent to the first header containing *title*.

        Returns None when the table has not been located, no header cell
        (``td.tb``) contains *title*, or the header has no following ``td``.
        """
        if self.table is None:
            return None
        for header in self.table.query_selector_all("td.tb"):
            if title in clean_text(header.text_content()):
                # Only the first matching header is considered.
                return header.query_selector("+ td")
        return None

    def _strip_related_suffix(self, text):
        """Remove a trailing "关联企业 N" counter from *text*."""
        return self._RELATED_SUFFIX.sub('', text).strip()

    def get_optimized_value(self, title):
        """Return the cleaned text of the value cell labelled *title*.

        The text of a ``.copy-value`` child is preferred when present;
        otherwise the whole cell text is used. Returns None when no matching
        cell exists.
        """
        value_cell = self._find_value_cell(title)
        if not value_cell:
            return None
        copy_value = value_cell.query_selector(".copy-value")
        return clean_text((copy_value or value_cell).text_content())

    def get_legal_representative(self):
        """Return the legal representative's name, or None if unavailable."""
        basic_value = (self.get_optimized_value("法定代表人")
                       or self.get_optimized_value("法人"))
        if basic_value:
            return self._strip_related_suffix(basic_value)

        # The plain lookup produced nothing; inspect the cell's links.
        value_cell = self._find_value_cell("法定代表人")
        if not value_cell:
            return None

        for link in value_cell.query_selector_all('a[target="_blank"]'):
            name = clean_text(link.text_content())
            # Skip empty links and links that are UI labels, not names.
            if name and "关联企业" not in name and "复制" not in name:
                return name

        # Fallback order: first link, then .copy-value, then the raw cell.
        source = (value_cell.query_selector("a")
                  or value_cell.query_selector(".copy-value")
                  or value_cell)
        return self._strip_related_suffix(clean_text(source.text_content()))

    def get_unified_social_credit_code(self):
        """Return the unified social credit code."""
        return (self.get_optimized_value("统一社会信用代码") or
                self.get_optimized_value("信用代码"))

    def get_business_registration_no(self):
        """Return the business registration number."""
        return (self.get_optimized_value("工商注册号") or
                self.get_optimized_value("注册号"))

    def get_organization_code(self):
        """Return the organization code."""
        return self.get_optimized_value("组织机构代码")

    def get_taxpayer_id(self):
        """Return the taxpayer ID, falling back to the unified credit code."""
        return (self.get_optimized_value("纳税人识别号") or
                self.get_unified_social_credit_code())

    def get_insurance_number(self):
        """Return the insured-employee count followed by the report link text.

        Returns None when the cell, its ``span``, or the number text is
        missing.
        """
        value_cell = self._find_value_cell("参保人数")
        if not value_cell:
            return None
        number_span = value_cell.query_selector("span")
        number = clean_text(number_span.text_content()) if number_span else None
        if not number:
            return None
        # The annual-report link text is appended verbatim when present.
        report_link = value_cell.query_selector("a.m-l-r-10")
        report_year = clean_text(report_link.text_content()) if report_link else ""
        return f"{number}{report_year}"

    def get_phone_number(self):
        """Return the phone number from the contact-info area, or None."""
        contact_info = self.page.query_selector("div.contact-info")
        if not contact_info:
            return None
        right_part = contact_info.query_selector("div.main-part-item.right")
        if not right_part:
            return None

        # First row whose text carries the phone label.
        phone_row = next(
            (row for row in right_part.query_selector_all("div.rline")
             if "电话:" in clean_text(row.text_content())),
            None,
        )
        if not phone_row:
            return None

        # First copyable span that is not the label itself holds the number.
        for span in phone_row.query_selector_all("span.need-copy-field"):
            text = clean_text(span.text_content())
            if "电话:" not in text:
                return text
        return None

    def get_approval_date(self):
        """Return the approval date, falling back to the founding date."""
        return (self.get_optimized_value("核准日期") or
                self.get_optimized_value("成立日期"))

    def parse_company_info(self):
        """Locate the info table and collect every supported field.

        Returns:
            A dict mapping field labels to extracted values (None for fields
            not found), also stored in ``self.company_data``; or None when
            the info table cannot be located.
        """
        if not self.init_table():
            return None
        self.company_data = {
            "企业名称": (self.get_optimized_value("企业名称") or
                     self.get_optimized_value("公司名称")),
            "统一社会信用代码": self.get_unified_social_credit_code(),
            "法定代表人": self.get_legal_representative(),
            "经营状态": self.get_optimized_value("登记状态"),
            "成立日期": self.get_optimized_value("成立日期"),
            "行政区划": self.get_optimized_value("行政区划"),
            "注册资本": self.get_optimized_value("注册资本"),
            "实缴资本": self.get_optimized_value("实缴资本"),
            "企业类型": self.get_optimized_value("企业类型"),
            "所属行业": self.get_optimized_value("国标行业"),
            "工商注册号": self.get_business_registration_no(),
            "组织机构代码": self.get_organization_code(),
            "纳税人识别号": self.get_taxpayer_id(),
            "纳税人资质": self.get_optimized_value("纳税人资质"),
            "营业期限": self.get_optimized_value("营业期限"),
            "核准日期": self.get_approval_date(),
            "参保人数": self.get_insurance_number(),
            "电话": self.get_phone_number(),
            "登记机关": self.get_optimized_value("登记机关"),
            "曾用名": self.get_optimized_value("曾用名"),
            "注册地址": self.get_optimized_value("注册地址"),
            "经营范围": self.get_optimized_value("经营范围"),
        }
        return self.company_data
def load_cookies(context, cookie_file):
    """Load cookies stored in *cookie_file* into a browser context.

    Args:
        context: Object exposing ``add_cookies(cookies)``; ``main`` passes a
            Playwright browser context.
        cookie_file: Path of a JSON file holding the cookie list.

    Returns:
        True when the cookies were read and handed to the context; False when
        the file is missing, unreadable, or not valid JSON.
    """
    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
    except FileNotFoundError:
        # No saved session yet; this is the normal first-run case.
        return False
    except (OSError, ValueError) as exc:
        # A corrupt or unreadable cookie file must not abort the run: report
        # it and let the caller proceed as if no cookies were saved.
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"无法读取cookies文件,已忽略: {exc}")
        return False
    context.add_cookies(cookies)
    print("已加载本地cookies")
    return True
def save_cookies(context, cookie_file):
    """Write the cookies of *context* to *cookie_file* as JSON.

    The cookies are fetched via ``context.cookies()`` before the file is
    opened, then serialised with ``json.dump``. A confirmation message is
    printed afterwards.
    """
    current_cookies = context.cookies()
    with open(cookie_file, mode='w') as cookie_fp:
        json.dump(obj=current_cookies, fp=cookie_fp)
    print("已保存cookies到文件")
def wait_for_login(page, cookie_file):
    """Wait for the user to finish the QR-code login, then save the cookies.

    Args:
        page: Page currently showing the login screen; must provide
            ``wait_for_url`` and ``context``.
        cookie_file: Path the post-login cookies are written to.

    Any timeout raised by ``page.wait_for_url`` propagates to the caller.
    """
    print("检测到需要登录,请使用手机扫码登录...")
    print("登录成功后将自动跳转到目标页面")
    # Confirm we are on the login page. A substring predicate mirrors the
    # caller's `"weblogin" in page.url` check; the former "**/weblogin" glob
    # would not match a login URL carrying a query string or trailing path
    # and would then time out after 3 seconds.
    page.wait_for_url(lambda url: "weblogin" in url, timeout=3000)
    # Give the user up to two minutes to scan the code and leave the login page.
    page.wait_for_url(lambda url: "weblogin" not in url, timeout=120000)
    # Persist the authenticated session.
    save_cookies(page.context, cookie_file)
    print("登录成功已保存cookies")
def main():
    """Command-line entry point: open a QCC company page and print its data.

    Parses the URL, ``--headless`` and ``--cookie-file`` arguments, launches
    Chromium, restores saved cookies when available, waits for a QR-code
    login if the site redirects to the login page, then prints the parsed
    company information as JSON. Any exception is reported on stdout and the
    browser is always closed.
    """
    arg_parser = argparse.ArgumentParser(description='解析企查查公司信息')
    arg_parser.add_argument('url', help='企查查公司页面URL')
    arg_parser.add_argument('--headless', action='store_true', help='无头模式运行')
    arg_parser.add_argument('--cookie-file', default='qcc_cookies.txt', help='cookies文件路径')
    options = arg_parser.parse_args()
    target_url = options.url
    cookie_path = options.cookie_file

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=options.headless)
        context = browser.new_context()
        page = context.new_page()
        try:
            # Reuse a previously saved session when one exists.
            if load_cookies(context, cookie_path):
                print("使用已保存的登录信息")

            page.goto(target_url)

            # A redirect to the login page means the session is missing or stale.
            if "weblogin" not in page.url:
                print("已登录或无需登录")
            else:
                wait_for_login(page, cookie_path)

            # Load the target again so parsing starts from the company page.
            page.goto(target_url)

            company_info = QCCParser(page).parse_company_info()
            if not company_info:
                print("未能获取公司信息")
            else:
                print(json.dumps(company_info, ensure_ascii=False, indent=2))
        except Exception as e:
            print(f"发生错误: {e}")
        finally:
            browser.close()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
# Usage examples:
#
#   python qcc.py "https://www.qcc.com/firm/50b0e3189f2eb2b20304b255669ce1a1.html"
#
#   # First run: a QR-code login is required
#   python qcc.py "https://www.qcc.com/firm/<company-page>"
#
#   # Later runs automatically reuse the saved login information
#   python qcc.py "https://www.qcc.com/firm/<company-page>"
#
#   # Use a custom cookies file
#   python qcc.py --cookie-file my_cookies.txt "https://www.qcc.com/firm/<company-page>"