Files
SearchCompany/tool/aiqicha_detail_parser.py
manchuwork 102dd78c26 aiqicha
2025-09-25 03:19:34 +08:00

143 lines
5.2 KiB
Python

# file: tool/aiqicha_detail_parser.py
import re
class AiqichaDetailParser:
    """Parser for an Aiqicha (爱企查) company detail page.

    The parser only relies on two capabilities of the objects it is given:
    ``page.query_selector(css)`` returning an element (or a falsy value when
    nothing matches), and ``element.inner_text()`` returning the element text.
    NOTE(review): this looks like the Playwright sync API -- confirm with the
    caller that constructs ``page``.
    """

    # Sentinel returned for every field that cannot be extracted.
    _UNKNOWN = "未知"

    # Class names of nested value nodes that are preferred over the matched
    # element itself, in lookup order.
    _VALUE_NODE_CLASSES = ("enter-bg-ele", "addr-enter-bg-ele")

    # Output field name -> candidate CSS selectors, tried in order.
    _FIELD_SELECTORS = {
        'name': ('.company-name', '.enterprise-name'),
        'credit_code': ('.credit-code', '.unified-social-credit-code'),
        'legal_representative': ('.legal-person', '.legal-representative'),
        'registered_capital': ('.reg-capital', '.registered-capital'),
        'establishment_date': ('.establishment-date', '.setup-date'),
        'business_status': ('.business-status', '.operating-state'),
        'address': ('.address', '.registered-address'),
        'business_scope': ('.business-scope', '.business-scope-content'),
        'company_type': ('.company-type', '.enterprise-type'),
        'industry': ('.industry', '.industry-category'),
        'registration_authority': ('.registration-authority', '.register-authority'),
        'operating_period': ('.operating-period', '.business-period'),
        'actual_capital': ('.actual-capital', '.paid-capital'),
        'taxpayer_id': ('.taxpayer-id', '.tax-id-number'),
        'organization_code': ('.organization-code',),
        'english_name': ('.english-name',),
        'approved_date': ('.approved-date', '.approval-date'),
        'staff_size': ('.staff-size', '.insured-persons'),
        'former_name': ('.former-name', '.previous-name'),
    }

    def __init__(self, page):
        """Store the page to parse.

        Args:
            page: Browser page object exposing ``query_selector``.
        """
        self.page = page

    def parse_company_info(self) -> dict:
        """Extract the basic company information from the page.

        Modelled on the ``AiQiChaParser`` implementation.

        Returns:
            dict: One entry per key of ``_FIELD_SELECTORS`` plus ``'phone'``.
            A value is the cleaned text, or ``"未知"`` when nothing usable
            was found.
        """
        company_info = {
            field: self._extract_field_value(selectors)
            for field, selectors in self._FIELD_SELECTORS.items()
        }
        # The phone number lives in a dedicated container, not a field cell.
        company_info['phone'] = self._extract_phone_number()
        return company_info

    def _find_element(self, selector):
        """Return the best element for *selector*, or a falsy value if none.

        A nested value node (see ``_VALUE_NODE_CLASSES``) is preferred; the
        element matched by *selector* itself is the fallback.
        """
        for css_class in self._VALUE_NODE_CLASSES:
            element = self.page.query_selector(f"{selector} .{css_class}")
            if element:
                print(f"找到 {css_class} 元素,选择器: {selector} .{css_class}")
                return element

        element = self.page.query_selector(selector)
        if element:
            print(f"找到直接元素,选择器: {selector}")
        return element

    def _extract_field_value(self, selectors) -> str:
        """Return the first non-empty text found via *selectors*.

        Args:
            selectors: Iterable of CSS selectors, tried in order.

        Returns:
            str: The cleaned text, or ``"未知"`` when every selector fails,
            matches nothing, or yields only whitespace.
        """
        for selector in selectors:
            try:
                print(f"尝试选择器: {selector}")
                element = self._find_element(selector)
                if not element:
                    print(f"未找到元素,选择器: {selector}")
                    continue

                text = element.inner_text().strip()
                print(f"提取到原始文本: '{text}'")
                text = self._clean_text(text)
                print(f"清理后文本: '{text}'")
                if text:
                    print(f"返回文本: '{text}'")
                    return text
                print("文本为空或仅包含空白字符")
            except Exception as e:
                # Best-effort scraping: a failing selector must not abort the
                # remaining candidates, so report it and move on.
                print(f"提取字段时出错,选择器: {selector}, 错误: {e}")

        print("所有选择器都未找到有效元素,返回默认值")
        return self._UNKNOWN

    def _clean_text(self, text) -> str:
        """Collapse whitespace runs to single spaces and trim the ends.

        Args:
            text: Raw text; ``None`` is treated as empty.

        Returns:
            str: The normalised text (possibly empty).
        """
        if not text:
            return ""
        # ``\s+`` already covers \r, \n and \t, so one pass is sufficient.
        return re.sub(r'\s+', ' ', text).strip()

    def _extract_phone_number(self) -> str:
        """Return the first listed phone number.

        Returns:
            str: The cleaned phone text, or ``"未知"`` when the container or
            the number element is missing, the text is blank, or the lookup
            raises.
        """
        try:
            container = self.page.query_selector(
                "div.business-info div.telphone-lists-wrap"
            )
            if not container:
                return self._UNKNOWN

            phone_element = container.query_selector("span.copy-box span")
            if not phone_element:
                return self._UNKNOWN

            phone = self._clean_text(phone_element.inner_text())
        except Exception as e:
            # Best-effort, like _extract_field_value: report instead of
            # silently swallowing, then fall back to the sentinel.
            print(f"提取电话号码时出错, 错误: {e}")
            return self._UNKNOWN

        # A blank number is reported the same way as a missing one.
        return phone or self._UNKNOWN