# Listing metadata: 333 lines, 11 KiB, Python.
# qcc.py
|
||
import json
|
||
import re
|
||
import os
|
||
from playwright.sync_api import sync_playwright
|
||
import argparse
|
||
|
||
|
||
def clean_text(text):
    """Normalize scraped text into a single trimmed line.

    Carriage returns, newlines and tabs are deleted outright (they are not
    turned into spaces). Every remaining run of whitespace is collapsed to
    one space and the result is stripped.

    Args:
        text: The raw text; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    if text:
        # Drop line breaks and tabs first so they never become spaces.
        without_breaks = text.translate({ord(ch): None for ch in '\r\n\t'})
        collapsed = re.sub(r'\s+', ' ', without_breaks)
        return collapsed.strip()
    return ""
|
||
|
||
|
||
class QCCParser:
    """Extract company registration details from a rendered QCC company page.

    Values are read from the table matched by ``table.ntable`` inside
    ``div.cominfo-normal``. Header cells are matched with ``td.tb`` and each
    value is read from the cell selected by ``+ td`` relative to its header.
    """

    # Trailing "关联企业 N" badge text that may follow a person's name.
    _RELATED_COMPANIES_SUFFIX = re.compile(r'\s*关联企业\s*\d+$')

    # Label text that marks the phone row in the contact block.
    _PHONE_LABEL = "电话:"

    def __init__(self, page):
        """Store the page to parse.

        Args:
            page: Object exposing ``wait_for_load_state`` and
                ``query_selector`` (a Playwright sync ``Page``).
        """
        self.page = page
        self.company_data = {}
        # Filled in by init_table(). Starting at None lets lookups made
        # before (or after a failed) init_table() return None instead of
        # raising AttributeError.
        self.table = None

    def init_table(self):
        """Locate the company-information table and cache it on ``self.table``.

        Returns:
            True when the table was found; False otherwise (a message is
            printed saying which element is missing).
        """
        # Wait for the page to finish loading.
        self.page.wait_for_load_state('networkidle')

        # Container that wraps the company information.
        container = self.page.query_selector("div.cominfo-normal")
        if not container:
            print("未找到企业信息容器")
            return False

        # The information table inside that container.
        self.table = container.query_selector("table.ntable")
        if not self.table:
            print("未找到企业信息表格")
            return False

        return True

    def _find_value_cell(self, title):
        """Return the value cell of the first header whose text contains *title*.

        Headers without a following value cell are skipped. Returns None
        when the table has not been located or nothing matches.
        """
        if self.table is None:
            return None
        for header in self.table.query_selector_all("td.tb"):
            if title not in clean_text(header.text_content()):
                continue
            # NOTE(review): relies on the "+ td" selector being resolved
            # relative to the header element — confirm against Playwright.
            value_cell = header.query_selector("+ td")
            if value_cell:
                return value_cell
        return None

    @staticmethod
    def _cell_text(value_cell):
        """Return the cleaned text of a value cell, preferring ``.copy-value``."""
        copy_value = value_cell.query_selector(".copy-value")
        return clean_text((copy_value or value_cell).text_content())

    @classmethod
    def _strip_related_suffix(cls, text):
        """Remove a trailing "关联企业 N" suffix from *text* and trim it."""
        return cls._RELATED_COMPANIES_SUFFIX.sub('', text).strip()

    def get_optimized_value(self, title):
        """Return the cleaned value for the header containing *title*.

        Args:
            title: Substring to look for in the header cells.

        Returns:
            The cleaned value text, or None when no matching header with a
            value cell exists.
        """
        value_cell = self._find_value_cell(title)
        if value_cell is None:
            return None
        return self._cell_text(value_cell)

    def get_legal_representative(self):
        """Return the legal representative's name, or None when unavailable.

        The plain table value is used when it is non-empty. Otherwise the
        value cell is searched, in order, for a ``target="_blank"`` link
        with a usable name, the first link of any kind, and finally the
        cell text itself.
        """
        # Plain lookup first.
        basic_value = (self.get_optimized_value("法定代表人")
                       or self.get_optimized_value("法人"))
        if basic_value:
            # Drop extras such as "关联企业 X".
            return self._strip_related_suffix(basic_value)

        value_cell = self._find_value_cell("法定代表人")
        if value_cell is None:
            return None

        # Links that carry the representative's name.
        for link in value_cell.query_selector_all('a[target="_blank"]'):
            name = clean_text(link.text_content())
            # Skip empty text and link texts that are not a name.
            if name and "关联企业" not in name and "复制" not in name:
                return name

        # Fallback: the first link of any kind.
        first_link = value_cell.query_selector("a")
        if first_link:
            return self._strip_related_suffix(clean_text(first_link.text_content()))

        # Last resort: the cell text itself.
        return self._strip_related_suffix(self._cell_text(value_cell))

    def get_unified_social_credit_code(self):
        """Return the unified social credit code, or None when not found."""
        return (self.get_optimized_value("统一社会信用代码") or
                self.get_optimized_value("信用代码"))

    def get_business_registration_no(self):
        """Return the business registration number, or None when not found."""
        return (self.get_optimized_value("工商注册号") or
                self.get_optimized_value("注册号"))

    def get_organization_code(self):
        """Return the organization code, or None when not found."""
        return self.get_optimized_value("组织机构代码")

    def get_taxpayer_id(self):
        """Return the taxpayer ID, falling back to the unified social credit code."""
        return (self.get_optimized_value("纳税人识别号") or
                self.get_unified_social_credit_code())

    def get_insurance_number(self):
        """Return the insured headcount as ``"<n>人 <report year>"``.

        The report-year part is omitted (with no trailing space) when the
        cell has no ``a.m-l-r-10`` link. Returns None when the header, its
        value cell, or the number itself is missing.
        """
        value_cell = self._find_value_cell("参保人数")
        if value_cell is None:
            return None

        # The headcount number.
        number_span = value_cell.query_selector("span")
        number = clean_text(number_span.text_content()) if number_span else None
        if not number:
            return None

        # The annual-report year shown next to it.
        report_link = value_cell.query_selector("a.m-l-r-10")
        report_year = clean_text(report_link.text_content()) if report_link else ""

        # rstrip() removes the separator when there is no report year.
        return f"{number}人 {report_year}".rstrip()

    def get_phone_number(self):
        """Return the contact phone number, or None when it cannot be located.

        The number is read from the first ``span.need-copy-field`` in the
        phone row whose text does not contain the "电话:" label.
        """
        # Contact-information container.
        contact_info = self.page.query_selector("div.contact-info")
        if not contact_info:
            return None

        # Right-hand information area.
        right_part = contact_info.query_selector("div.main-part-item.right")
        if not right_part:
            return None

        # The row that carries the phone label.
        phone_row = next(
            (row for row in right_part.query_selector_all("div.rline")
             if self._PHONE_LABEL in clean_text(row.text_content())),
            None,
        )
        if phone_row is None:
            return None

        # First copyable span that is not the label itself.
        for span in phone_row.query_selector_all("span.need-copy-field"):
            text = clean_text(span.text_content())
            if self._PHONE_LABEL not in text:
                return text
        return None

    def get_approval_date(self):
        """Return the approval date, falling back to the establishment date."""
        return (self.get_optimized_value("核准日期") or
                self.get_optimized_value("成立日期"))

    def parse_company_info(self):
        """Parse all supported fields and return them as a dict.

        Returns:
            A dict mapping field labels to values (None for fields that
            were not found). The same dict is stored on
            ``self.company_data``. Returns None when the information table
            cannot be located.
        """
        if not self.init_table():
            return None

        self.company_data = {
            "企业名称": (self.get_optimized_value("企业名称") or
                     self.get_optimized_value("公司名称")),
            "统一社会信用代码": self.get_unified_social_credit_code(),
            "法定代表人": self.get_legal_representative(),
            "经营状态": self.get_optimized_value("登记状态"),
            "成立日期": self.get_optimized_value("成立日期"),
            "行政区划": self.get_optimized_value("行政区划"),
            "注册资本": self.get_optimized_value("注册资本"),
            "实缴资本": self.get_optimized_value("实缴资本"),
            "企业类型": self.get_optimized_value("企业类型"),
            "所属行业": self.get_optimized_value("国标行业"),
            "工商注册号": self.get_business_registration_no(),
            "组织机构代码": self.get_organization_code(),
            "纳税人识别号": self.get_taxpayer_id(),
            "纳税人资质": self.get_optimized_value("纳税人资质"),
            "营业期限": self.get_optimized_value("营业期限"),
            "核准日期": self.get_approval_date(),
            "参保人数": self.get_insurance_number(),
            "电话": self.get_phone_number(),
            "登记机关": self.get_optimized_value("登记机关"),
            "曾用名": self.get_optimized_value("曾用名"),
            "注册地址": self.get_optimized_value("注册地址"),
            "经营范围": self.get_optimized_value("经营范围"),
        }

        return self.company_data
|
||
|
||
|
||
def load_cookies(context, cookie_file):
    """Load cookies from a JSON file into a browser context.

    A missing, unreadable or malformed file is treated as "no saved
    cookies": the function reports it and returns False, so the caller can
    continue to a fresh login instead of aborting.

    Args:
        context: Object exposing ``add_cookies(cookies)``.
        cookie_file: Path of the JSON file holding the cookies.

    Returns:
        True when cookies were read and handed to the context, else False.
    """
    if not os.path.exists(cookie_file):
        return False

    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"无法读取cookies文件: {exc}")
        return False

    context.add_cookies(cookies)
    print("已加载本地cookies")
    return True
|
||
|
||
|
||
def save_cookies(context, cookie_file):
    """Write the context's cookies to a file as JSON.

    Args:
        context: Object exposing ``cookies()`` that returns JSON-serializable data.
        cookie_file: Path of the file to (over)write.
    """
    current_cookies = context.cookies()
    with open(cookie_file, 'w') as handle:
        handle.write(json.dumps(current_cookies))
    print("已保存cookies到文件")
|
||
|
||
|
||
def wait_for_login_and_save_cookies(page, cookie_file):
    """Block until the user has logged in by QR code, then persist the cookies.

    Args:
        page: Page currently showing the login screen; must expose
            ``wait_for_url`` and ``context``.
        cookie_file: Path the cookies are saved to after login.
    """
    for notice in ("检测到需要登录,请使用手机扫码登录...",
                   "登录成功后将自动跳转到目标页面"):
        print(notice)

    # Login counts as done once the page URL matches the firm pattern again.
    # The timeout value is 120000 (Playwright timeouts are in milliseconds).
    page.wait_for_url("**/firm/**", timeout=120000)

    # Persist the post-login cookies.
    save_cookies(page.context, cookie_file)
    print("登录成功,已保存cookies")
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Opens the given QCC company URL in Chromium, reuses saved cookies when
    present, waits for a QR-code login if redirected to the login page, and
    prints the parsed company information as JSON.
    """
    arg_parser = argparse.ArgumentParser(description='解析企查查公司信息')
    arg_parser.add_argument('url', help='企查查公司页面URL')
    arg_parser.add_argument('--headless', action='store_true', help='无头模式运行')
    arg_parser.add_argument('--cookie-file', default='qcc_cookies.txt', help='cookies文件路径')
    options = arg_parser.parse_args()

    with sync_playwright() as playwright:
        # Start the browser and open a blank page.
        browser = playwright.chromium.launch(headless=options.headless)
        context = browser.new_context()
        page = context.new_page()

        try:
            # Reuse cookies from an earlier session if the file exists.
            load_cookies(context, options.cookie_file)

            page.goto(options.url)

            # A URL containing "weblogin" means we are on the login page.
            on_login_page = "weblogin" in page.url
            if not on_login_page:
                print("已登录或无需登录")
            else:
                # Wait for the QR-code login, then save the new cookies.
                wait_for_login_and_save_cookies(page, options.cookie_file)

            company_info = QCCParser(page).parse_company_info()

            if not company_info:
                print("未能获取公司信息")
            else:
                # Pretty-print, keeping non-ASCII characters readable.
                print(json.dumps(company_info, ensure_ascii=False, indent=2))

        except Exception as e:
            print(f"发生错误: {e}")
        finally:
            browser.close()
|
||
|
||
|
||
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||
# python qcc.py "https://www.qcc.com/firm/50b0e3189f2eb2b20304b255669ce1a1.html"
|