143 lines
5.2 KiB
Python
143 lines
5.2 KiB
Python
# file: tool/aiqicha_detail_parser.py
|
|
|
|
import re
|
|
|
|
|
|
class AiqichaDetailParser:
    """Parser for the Aiqicha (爱企查) company detail page.

    The parser pulls text out of an already-loaded browser page by trying
    a list of candidate CSS selectors per field. Extraction is best-effort:
    any value that cannot be read is reported as the placeholder ``"未知"``
    ("unknown") instead of raising.
    """

    # Placeholder returned whenever a value cannot be extracted.
    _UNKNOWN = "未知"

    # Any run of whitespace (spaces, tabs, CR/LF, ...) collapses to one space.
    _WHITESPACE_RE = re.compile(r'\s+')

    # Output field name -> candidate CSS selectors, tried in order.
    _FIELD_SELECTORS = {
        'name': ['.company-name', '.enterprise-name'],
        'credit_code': ['.credit-code', '.unified-social-credit-code'],
        'legal_representative': ['.legal-person', '.legal-representative'],
        'registered_capital': ['.reg-capital', '.registered-capital'],
        'establishment_date': ['.establishment-date', '.setup-date'],
        'business_status': ['.business-status', '.operating-state'],
        'address': ['.address', '.registered-address'],
        'business_scope': ['.business-scope', '.business-scope-content'],
        'company_type': ['.company-type', '.enterprise-type'],
        'industry': ['.industry', '.industry-category'],
        'registration_authority': ['.registration-authority', '.register-authority'],
        'operating_period': ['.operating-period', '.business-period'],
        'actual_capital': ['.actual-capital', '.paid-capital'],
        'taxpayer_id': ['.taxpayer-id', '.tax-id-number'],
        'organization_code': ['.organization-code'],
        'english_name': ['.english-name'],
        'approved_date': ['.approved-date', '.approval-date'],
        'staff_size': ['.staff-size', '.insured-persons'],
        'former_name': ['.former-name', '.previous-name'],
    }

    def __init__(self, page):
        """Store the page that later extraction calls will query.

        Args:
            page: Browser page object. Only ``page.query_selector(css)`` is
                called on it, and the returned elements must offer
                ``inner_text()`` (and ``query_selector`` for the phone
                container). NOTE(review): presumably a Playwright
                synchronous ``Page`` -- confirm against the caller.
        """
        self.page = page

    def parse_company_info(self):
        """Extract the basic company information from the page.

        Returns:
            dict: One entry per key of ``_FIELD_SELECTORS`` plus ``'phone'``.
            Every value is a cleaned string, or ``"未知"`` when nothing
            could be extracted for that field.
        """
        company_info = {
            field: self._extract_field_value(selectors)
            for field, selectors in self._FIELD_SELECTORS.items()
        }

        # The phone number lives in a dedicated container, not a field class.
        company_info['phone'] = self._extract_phone_number()

        return company_info

    def _find_element(self, selector):
        """Locate the element holding the value for ``selector``.

        Queries, in order, a descendant with class ``enter-bg-ele``, a
        descendant with class ``addr-enter-bg-ele``, and finally the
        selector itself. The first hit is returned and logged.

        Args:
            selector (str): Base CSS selector of the field.

        Returns:
            The first matching element, or the falsy result of the last
            query when none matched.
        """
        for css_class in ("enter-bg-ele", "addr-enter-bg-ele"):
            nested = f"{selector} .{css_class}"
            element = self.page.query_selector(nested)
            if element:
                print(f"找到 {css_class} 元素,选择器: {nested}")
                return element

        element = self.page.query_selector(selector)
        if element:
            print(f"找到直接元素,选择器: {selector}")
        return element

    def _extract_field_value(self, selectors):
        """Return the first non-empty text found via the given selectors.

        Each selector is tried in order. A selector whose element is
        missing, whose text is empty after cleaning, or whose lookup raises
        is skipped (with a progress message printed) and the next one is
        tried.

        Args:
            selectors (list): Candidate CSS selectors.

        Returns:
            str: The cleaned text, or ``"未知"`` if no selector yielded one.
        """
        for selector in selectors:
            try:
                print(f"尝试选择器: {selector}")

                element = self._find_element(selector)
                if not element:
                    print(f"未找到元素,选择器: {selector}")
                    continue

                text = element.inner_text().strip()
                print(f"提取到原始文本: '{text}'")
                text = self._clean_text(text)
                print(f"清理后文本: '{text}'")

                if text:
                    print(f"返回文本: '{text}'")
                    return text
                print("文本为空或仅包含空白字符")
            except Exception as e:
                # Best-effort scraping: one broken selector must not abort
                # the remaining candidates.
                print(f"提取字段时出错,选择器: {selector}, 错误: {e}")
                continue

        print("所有选择器都未找到有效元素,返回默认值")
        return self._UNKNOWN

    def _clean_text(self, text):
        """Normalise whitespace in ``text``.

        Every run of whitespace (including newlines and tabs) becomes a
        single space and the result is stripped at both ends.

        Args:
            text (str): Raw text; ``None`` or ``""`` is treated as empty.

        Returns:
            str: The cleaned text (possibly empty).
        """
        if not text:
            return ""
        # One pass is enough: ``\s+`` already covers \r, \n and \t, so no
        # separate removal of those characters is needed.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def _extract_phone_number(self):
        """Extract the phone number shown in the business-info section.

        Returns:
            str: The cleaned phone number, or ``"未知"`` when the container
            or number element is missing, the text is empty, or the lookup
            raises.
        """
        try:
            phone_container = self.page.query_selector(
                "div.business-info div.telphone-lists-wrap"
            )
            if phone_container:
                phone_element = phone_container.query_selector("span.copy-box span")
                if phone_element:
                    phone = self._clean_text(phone_element.inner_text())
                    # An empty element counts as "not found", consistent
                    # with _extract_field_value.
                    if phone:
                        return phone
        except Exception as e:
            # Stay best-effort, but report the failure instead of hiding it.
            print(f"提取电话号码时出错,错误: {e}")
        return self._UNKNOWN
|