# file: tool/aiqicha_detail_parser.py
import re


class AiqichaDetailParser:
    """Parser for an Aiqicha company detail page.

    The parser only relies on two capabilities of the objects it is given:
    ``query_selector(css)`` returning an element (or a falsy value when
    nothing matches) and ``inner_text()`` returning the element's text.
    NOTE(review): this looks like the Playwright sync API -- confirm against
    the caller that constructs the page.
    """

    # Value returned whenever a field cannot be extracted.
    _UNKNOWN = "未知"

    # Any run of whitespace (spaces, tabs, CR/LF, ...) is collapsed to one space.
    _WHITESPACE_RE = re.compile(r"\s+")

    # CSS classes of nested wrapper elements that are tried, in this order,
    # before falling back to the element matched by the field selector itself.
    _WRAPPED_VALUE_CLASSES = ("enter-bg-ele", "addr-enter-bg-ele")

    # Output key -> candidate CSS selectors, tried in order until one yields
    # non-empty text.
    _FIELD_SELECTORS = {
        'name': ('.company-name', '.enterprise-name'),
        'credit_code': ('.credit-code', '.unified-social-credit-code'),
        'legal_representative': ('.legal-person', '.legal-representative'),
        'registered_capital': ('.reg-capital', '.registered-capital'),
        'establishment_date': ('.establishment-date', '.setup-date'),
        'business_status': ('.business-status', '.operating-state'),
        'address': ('.address', '.registered-address'),
        'business_scope': ('.business-scope', '.business-scope-content'),
        'company_type': ('.company-type', '.enterprise-type'),
        'industry': ('.industry', '.industry-category'),
        'registration_authority': ('.registration-authority', '.register-authority'),
        'operating_period': ('.operating-period', '.business-period'),
        'actual_capital': ('.actual-capital', '.paid-capital'),
        'taxpayer_id': ('.taxpayer-id', '.tax-id-number'),
        'organization_code': ('.organization-code',),
        'english_name': ('.english-name',),
        'approved_date': ('.approved-date', '.approval-date'),
        'staff_size': ('.staff-size', '.insured-persons'),
        'former_name': ('.former-name', '.previous-name'),
    }

    def __init__(self, page):
        """Store the page object that all later queries run against.

        Args:
            page: Browser page object exposing ``query_selector``.
        """
        self.page = page

    def parse_company_info(self):
        """Extract the basic company information from the page.

        Returns:
            dict: One entry per key of ``_FIELD_SELECTORS`` plus ``'phone'``.
            Every value is a string; fields that could not be extracted hold
            "未知".
        """
        company_info = {
            field: self._extract_field_value(selectors)
            for field, selectors in self._FIELD_SELECTORS.items()
        }
        # The phone number lives in a dedicated container, so it is not part
        # of the generic selector table.
        company_info['phone'] = self._extract_phone_number()
        return company_info

    def _locate_element(self, selector):
        """Return the first element found for ``selector``, or a falsy value.

        The nested wrapper classes are tried first; the element matched by
        ``selector`` itself is the last resort.
        """
        for css_class in self._WRAPPED_VALUE_CLASSES:
            nested_selector = f"{selector} .{css_class}"
            element = self.page.query_selector(nested_selector)
            if element:
                print(f"找到 {css_class} 元素,选择器: {nested_selector}")
                return element

        element = self.page.query_selector(selector)
        if element:
            print(f"找到直接元素,选择器: {selector}")
        return element

    def _extract_field_value(self, selectors):
        """Return the cleaned text of the first selector that yields any.

        Progress is reported on stdout for every selector that is tried.

        Args:
            selectors: Iterable of CSS selectors, tried in order.

        Returns:
            str: The cleaned text, or "未知" when no selector produced
            non-empty text.
        """
        for selector in selectors:
            # Best effort: a failure on one selector must not prevent the
            # remaining selectors from being tried.
            try:
                print(f"尝试选择器: {selector}")
                element = self._locate_element(selector)
                if not element:
                    print(f"未找到元素,选择器: {selector}")
                    continue

                text = element.inner_text().strip()
                print(f"提取到原始文本: '{text}'")

                text = self._clean_text(text)
                print(f"清理后文本: '{text}'")

                if text:
                    print(f"返回文本: '{text}'")
                    return text
                print("文本为空或仅包含空白字符")
            except Exception as e:
                print(f"提取字段时出错,选择器: {selector}, 错误: {e}")
                continue

        print("所有选择器都未找到有效元素,返回默认值")
        return self._UNKNOWN

    def _clean_text(self, text):
        """Collapse whitespace runs to single spaces and trim both ends.

        Args:
            text (str): Raw text; ``None`` or an empty string is accepted.

        Returns:
            str: The normalised text ("" for ``None`` or empty input).
        """
        if not text:
            return ""
        # Newlines and tabs are whitespace too, so this single substitution
        # removes them; no separate pass is needed.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def _extract_phone_number(self):
        """Extract the phone number from the dedicated phone container.

        Returns:
            str: The cleaned phone number, or "未知" when the container or
            the number element is missing, the text is blank, or the lookup
            raises.
        """
        try:
            phone_container = self.page.query_selector(
                "div.business-info div.telphone-lists-wrap"
            )
            if not phone_container:
                return self._UNKNOWN

            phone_element = phone_container.query_selector("span.copy-box span")
            if not phone_element:
                return self._UNKNOWN

            phone = self._clean_text(phone_element.inner_text())
        except Exception as e:
            # Best effort, like the field extraction: report and fall back
            # to the default instead of failing the whole parse.
            print(f"提取电话号码时出错: {e}")
            return self._UNKNOWN

        # A present-but-blank element counts as "not found".
        return phone or self._UNKNOWN