# qcc.py
"""Command-line scraper for the company-information table on a qcc.com page.

The script opens the given URL in Chromium through Playwright, reuses
cookies saved by a previous run when available, waits for a QR-code login
when the site redirects to its login page, and finally prints the parsed
company fields as JSON.
"""
import argparse
import json
import os
import re

# Matches any run of whitespace so it can be collapsed to one space.
_WHITESPACE_RE = re.compile(r'\s+')

# Matches the trailing "关联企业 <n>" text that may follow a person's name.
_RELATED_COMPANIES_SUFFIX_RE = re.compile(r'\s*关联企业\s*\d+$')


def clean_text(text):
    """Normalize the whitespace of a scraped text fragment.

    CR, LF and TAB characters are deleted outright (they are not turned
    into spaces), every remaining run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw text, or ``None``.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or empty.
    """
    if not text:
        return ""
    without_breaks = text.replace('\r', '').replace('\n', '').replace('\t', '')
    return _WHITESPACE_RE.sub(' ', without_breaks).strip()


def _strip_related_suffix(text):
    """Remove a trailing "关联企业 <n>" marker from ``text`` and strip it."""
    return _RELATED_COMPANIES_SUFFIX_RE.sub('', text).strip()


class QCCParser:
    """Extracts company fields from an already-loaded company page.

    ``page`` must offer the element-query methods used below
    (``wait_for_load_state``, ``query_selector``); in this script it is a
    Playwright sync-API page. Call :meth:`parse_company_info` to run the
    whole extraction, or :meth:`init_table` followed by individual getters.
    """

    def __init__(self, page):
        self.page = page
        self.company_data = {}
        # Assigned by init_table(); all table-based getters depend on it.
        self.table = None

    def init_table(self):
        """Locate the company-information table and store it on ``self.table``.

        Returns:
            ``True`` when the table was found, ``False`` (after printing a
            message) when the container or the table is missing.
        """
        # Wait until the page has finished loading.
        self.page.wait_for_load_state('networkidle')

        cominfo_normal = self.page.query_selector("div.cominfo-normal")
        if not cominfo_normal:
            print("未找到企业信息容器")
            return False

        self.table = cominfo_normal.query_selector("table.ntable")
        if not self.table:
            print("未找到企业信息表格")
            return False
        return True

    def _matching_headers(self, title):
        """Yield the header cells (``td.tb``) whose cleaned text contains ``title``."""
        for header in self.table.query_selector_all("td.tb"):
            if title in clean_text(header.text_content()):
                yield header

    def _first_value_cell(self, title):
        """Return the cell right after the first header matching ``title``.

        Returns ``None`` when no header matches or when that header has no
        following ``td`` sibling.
        """
        header = next(self._matching_headers(title), None)
        if header is None:
            return None
        return header.query_selector("+ td")

    def get_optimized_value(self, title):
        """Return the cleaned value stored next to the header named ``title``.

        The header match is a substring test. When the value cell contains
        an element with class ``copy-value`` its text is preferred over the
        text of the whole cell.

        Returns:
            The cleaned value, or ``None`` when no matching header with a
            value cell exists.
        """
        for header in self._matching_headers(title):
            value_cell = header.query_selector("+ td")
            if not value_cell:
                # A header without a value cell cannot supply a value;
                # keep looking at later matches.
                continue
            copy_value = value_cell.query_selector(".copy-value")
            source = copy_value if copy_value else value_cell
            return clean_text(source.text_content())
        return None

    def get_legal_representative(self):
        """Return the legal representative's name, or ``None`` if not found.

        The plain table lookup is tried first. When it yields nothing the
        value cell is inspected directly: links opening in a new tab, then
        the first link, then the ``copy-value`` element, then the raw cell
        text. A trailing "关联企业 <n>" marker is removed from the result.
        """
        basic_value = (self.get_optimized_value("法定代表人")
                       or self.get_optimized_value("法人"))
        # get_optimized_value returns already-stripped text, so truthiness
        # is enough to tell that a real value came back.
        if basic_value:
            return _strip_related_suffix(basic_value)

        value_cell = self._first_value_cell("法定代表人")
        if not value_cell:
            return None

        # Prefer a link whose text is an actual name rather than a
        # "related companies" or "copy" control.
        for link in value_cell.query_selector_all('a[target="_blank"]'):
            name = clean_text(link.text_content())
            if name and "关联企业" not in name and "复制" not in name:
                return name

        # Fallback: the first link of any kind.
        first_link = value_cell.query_selector("a")
        if first_link:
            return _strip_related_suffix(clean_text(first_link.text_content()))

        # Last resort: the copy-value element, then the raw cell text.
        copy_value = value_cell.query_selector(".copy-value")
        if copy_value:
            return _strip_related_suffix(clean_text(copy_value.text_content()))
        return _strip_related_suffix(clean_text(value_cell.text_content()))

    def get_unified_social_credit_code(self):
        """Return the unified social credit code, or ``None``."""
        return (self.get_optimized_value("统一社会信用代码")
                or self.get_optimized_value("信用代码"))

    def get_business_registration_no(self):
        """Return the business registration number, or ``None``."""
        return (self.get_optimized_value("工商注册号")
                or self.get_optimized_value("注册号"))

    def get_organization_code(self):
        """Return the organization code, or ``None``."""
        return self.get_optimized_value("组织机构代码")

    def get_taxpayer_id(self):
        """Return the taxpayer ID, falling back to the unified social credit code."""
        return (self.get_optimized_value("纳税人识别号")
                or self.get_unified_social_credit_code())

    def get_insurance_number(self):
        """Return the insured-employee count, optionally followed by the report label.

        Returns:
            ``"<number>人 <report label>"``, or just ``"<number>人"`` when the
            value cell has no report link; ``None`` when the header, its
            value cell, or the number itself is missing.
        """
        value_cell = self._first_value_cell("参保人数")
        if not value_cell:
            return None

        number_span = value_cell.query_selector("span")
        number = clean_text(number_span.text_content()) if number_span else None
        if not number:
            return None

        report_link = value_cell.query_selector("a.m-l-r-10")
        report_year = clean_text(report_link.text_content()) if report_link else ""
        # strip() removes the dangling space left when there is no report label.
        return f"{number}人 {report_year}".strip()

    def get_phone_number(self):
        """Return the phone number from the page's contact block, or ``None``.

        The number is the first ``span.need-copy-field`` inside the row
        containing "电话:" whose own text does not contain that label.
        """
        contact_info = self.page.query_selector("div.contact-info")
        if not contact_info:
            return None

        right_part = contact_info.query_selector("div.main-part-item.right")
        if not right_part:
            return None

        phone_row = next(
            (row for row in right_part.query_selector_all("div.rline")
             if "电话:" in clean_text(row.text_content())),
            None,
        )
        if not phone_row:
            return None

        phone_span = next(
            (span for span in phone_row.query_selector_all("span.need-copy-field")
             if "电话:" not in clean_text(span.text_content())),
            None,
        )
        return clean_text(phone_span.text_content()) if phone_span else None

    def get_approval_date(self):
        """Return the approval date, falling back to the establishment date."""
        return (self.get_optimized_value("核准日期")
                or self.get_optimized_value("成立日期"))

    def parse_company_info(self):
        """Parse every supported field and return them as a dict.

        The result is also stored on ``self.company_data``.

        Returns:
            A dict keyed by the Chinese field names (values are strings or
            ``None``), or ``None`` when the information table is missing.
        """
        if not self.init_table():
            return None

        self.company_data = {
            "企业名称": (self.get_optimized_value("企业名称")
                     or self.get_optimized_value("公司名称")),
            "统一社会信用代码": self.get_unified_social_credit_code(),
            "法定代表人": self.get_legal_representative(),
            "经营状态": self.get_optimized_value("登记状态"),
            "成立日期": self.get_optimized_value("成立日期"),
            "行政区划": self.get_optimized_value("行政区划"),
            "注册资本": self.get_optimized_value("注册资本"),
            "实缴资本": self.get_optimized_value("实缴资本"),
            "企业类型": self.get_optimized_value("企业类型"),
            "所属行业": self.get_optimized_value("国标行业"),
            "工商注册号": self.get_business_registration_no(),
            "组织机构代码": self.get_organization_code(),
            "纳税人识别号": self.get_taxpayer_id(),
            "纳税人资质": self.get_optimized_value("纳税人资质"),
            "营业期限": self.get_optimized_value("营业期限"),
            "核准日期": self.get_approval_date(),
            "参保人数": self.get_insurance_number(),
            "电话": self.get_phone_number(),
            "登记机关": self.get_optimized_value("登记机关"),
            "曾用名": self.get_optimized_value("曾用名"),
            "注册地址": self.get_optimized_value("注册地址"),
            "经营范围": self.get_optimized_value("经营范围"),
        }
        return self.company_data


def load_cookies(context, cookie_file):
    """Load cookies from ``cookie_file`` into the browser context.

    Args:
        context: Browser context offering ``add_cookies``.
        cookie_file: Path of a JSON file written by :func:`save_cookies`.

    Returns:
        ``True`` when the file existed and its cookies were added,
        ``False`` when the file does not exist.
    """
    if not os.path.exists(cookie_file):
        return False
    with open(cookie_file, 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    context.add_cookies(cookies)
    print("已加载本地cookies")
    return True


def save_cookies(context, cookie_file):
    """Write the browser context's cookies to ``cookie_file`` as JSON."""
    cookies = context.cookies()
    with open(cookie_file, 'w', encoding='utf-8') as f:
        json.dump(cookies, f)
    print("已保存cookies到文件")


def wait_for_login(page, cookie_file):
    """Block until the user has logged in by QR code, then save the cookies.

    Waits up to 3 seconds for the page to be on the login URL and up to
    120 seconds for it to leave that URL again; a Playwright timeout error
    propagates to the caller if either wait expires.
    """
    print("检测到需要登录,请使用手机扫码登录...")
    print("登录成功后将自动跳转到目标页面")

    # A substring predicate is used for both waits so they agree with the
    # caller's '"weblogin" in page.url' check. A glob such as "**/weblogin"
    # has to match the entire URL, so it would time out on a login URL that
    # carries a query string.
    page.wait_for_url(lambda url: "weblogin" in url, timeout=3000)
    page.wait_for_url(lambda url: "weblogin" not in url, timeout=120000)

    # Persist the logged-in session for later runs.
    save_cookies(page.context, cookie_file)
    print("登录成功,已保存cookies")


def main():
    """Parse the command line, open the company page and print its data as JSON."""
    # Imported here because main() is the only user; the parsing helpers
    # above therefore stay importable without Playwright installed.
    from playwright.sync_api import sync_playwright

    arg_parser = argparse.ArgumentParser(description='解析企查查公司信息')
    arg_parser.add_argument('url', help='企查查公司页面URL')
    arg_parser.add_argument('--headless', action='store_true', help='无头模式运行')
    arg_parser.add_argument('--cookie-file', default='qcc_cookies.txt',
                            help='cookies文件路径')
    args = arg_parser.parse_args()

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=args.headless)
        context = browser.new_context()
        page = context.new_page()

        try:
            # Reuse a previously saved login session when there is one.
            if load_cookies(context, args.cookie_file):
                print("使用已保存的登录信息")

            page.goto(args.url)

            # Being redirected to the login page means a login is required.
            if "weblogin" in page.url:
                wait_for_login(page, args.cookie_file)
            else:
                print("已登录或无需登录")

            # Visit the target URL again to make sure the right page is loaded.
            page.goto(args.url)

            company_parser = QCCParser(page)
            company_info = company_parser.parse_company_info()

            if company_info:
                print(json.dumps(company_info, ensure_ascii=False, indent=2))
            else:
                print("未能获取公司信息")
        except Exception as e:
            # Top-level boundary: report the failure; the browser is still
            # closed by the finally clause below.
            print(f"发生错误: {e}")
        finally:
            browser.close()


if __name__ == "__main__":
    main()

# python qcc.py "https://www.qcc.com/firm/50b0e3189f2eb2b20304b255669ce1a1.html"
#
# # The first run requires a QR-code login
# python qcc.py "https://www.qcc.com/firm/<company-page>"
#
# # Later runs automatically reuse the saved login session
# python qcc.py "https://www.qcc.com/firm/<company-page>"
#
# # Use a custom cookies file
# python qcc.py --cookie-file my_cookies.txt "https://www.qcc.com/firm/<company-page>"