aiqicha
This commit is contained in:
@@ -15,6 +15,24 @@ def query_init_company_data(csv_file_name):
|
|||||||
print("所有数据:", all_data)
|
print("所有数据:", all_data)
|
||||||
return all_data
|
return all_data
|
||||||
|
|
||||||
|
|
||||||
|
def parse_operating_period(period_str):
    """Split an operating-period string into a ``(start, end)`` pair.

    Recognised shapes:
      * ``"2020-01-01至2030-12-31"`` -> ``("2020-01-01", "2030-12-31")``
      * ``"2020-01-01起长期"``        -> ``("2020-01-01", "长期")``
      * any other non-empty text     -> ``(stripped text, "")``

    A falsy input (``None`` or ``""``) yields ``("", "")``.
    """
    if not period_str:
        return '', ''

    if '至' in period_str:
        # Only the first two segments are used; any further "至" pieces
        # are ignored.
        begin, finish = period_str.split('至')[:2]
        return begin.strip(), finish.strip()

    if '起' in period_str:
        # "<date>起..." has no explicit end; the end is reported as "长期".
        begin = period_str.split('起')[0]
        return begin.strip(), '长期'

    # No known separator: treat the whole text as the start value.
    return period_str.strip(), ''
|
||||||
|
|
||||||
def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||||
"""
|
"""
|
||||||
从CSV文件中读取爱企查URL,爬取企业详情,并保存到新的CSV文件中
|
从CSV文件中读取爱企查URL,爬取企业详情,并保存到新的CSV文件中
|
||||||
@@ -41,18 +59,21 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
|||||||
'address', # 公司地址
|
'address', # 公司地址
|
||||||
'business_scope', # 经营范围
|
'business_scope', # 经营范围
|
||||||
'source_url', # 原始URL
|
'source_url', # 原始URL
|
||||||
'create_time' ,
|
|
||||||
# 新增字段
|
# 新增字段
|
||||||
'company_type', # 企业类型
|
'company_type', # 企业类型
|
||||||
'industry', # 所属行业
|
'industry', # 所属行业
|
||||||
'registration_authority', # 登记机关
|
'registration_authority', # 登记机关
|
||||||
'operating_period', # 营业期限
|
'operating_period', # 营业期限
|
||||||
|
'operating_start_date', # 营业期限开始日期
|
||||||
|
'operating_end_date', # 营业期限结束日期
|
||||||
'actual_capital', # 实缴资本
|
'actual_capital', # 实缴资本
|
||||||
'taxpayer_id', # 纳税人识别号
|
'taxpayer_id', # 纳税人识别号
|
||||||
'organization_code', # 组织机构代码
|
'organization_code', # 组织机构代码
|
||||||
'approved_date', # 核准日期
|
'approved_date', # 核准日期
|
||||||
'staff_size', # 参保人数
|
'staff_size', # 参保人数
|
||||||
'phone' # 电话
|
'phone', # 电话,
|
||||||
|
'create_time',
|
||||||
# 创建时间
|
# 创建时间
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -104,6 +125,11 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
|||||||
|
|
||||||
if detail:
|
if detail:
|
||||||
# 添加来源URL和公司名称
|
# 添加来源URL和公司名称
|
||||||
|
|
||||||
|
# 解析营业期限字段
|
||||||
|
operating_period = detail.get('operating_period', '')
|
||||||
|
start_date, end_date = parse_operating_period(operating_period)
|
||||||
|
|
||||||
detail['source_url'] = url
|
detail['source_url'] = url
|
||||||
# 转换字段名以匹配CSV表头
|
# 转换字段名以匹配CSV表头
|
||||||
converted_item = {
|
converted_item = {
|
||||||
@@ -121,6 +147,8 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
|||||||
'industry': detail.get('industry', ''),
|
'industry': detail.get('industry', ''),
|
||||||
'registration_authority': detail.get('registration_authority', ''),
|
'registration_authority': detail.get('registration_authority', ''),
|
||||||
'operating_period': detail.get('operating_period', ''),
|
'operating_period': detail.get('operating_period', ''),
|
||||||
|
'operating_start_date': str(start_date),
|
||||||
|
'operating_end_date': str(end_date),
|
||||||
'actual_capital': detail.get('actual_capital', ''),
|
'actual_capital': detail.get('actual_capital', ''),
|
||||||
'taxpayer_id': detail.get('taxpayer_id', ''),
|
'taxpayer_id': detail.get('taxpayer_id', ''),
|
||||||
'organization_code': detail.get('organization_code', ''),
|
'organization_code': detail.get('organization_code', ''),
|
||||||
|
|||||||
4390
demo/aiqicha-detail-logined.html
Normal file
4390
demo/aiqicha-detail-logined.html
Normal file
File diff suppressed because one or more lines are too long
4465
demo/aiqicha-detail2-vip.html
Normal file
4465
demo/aiqicha-detail2-vip.html
Normal file
File diff suppressed because one or more lines are too long
@@ -50,6 +50,47 @@ class AiqichaDetailCrawler:
|
|||||||
finally:
|
finally:
|
||||||
self.browser_started = False
|
self.browser_started = False
|
||||||
|
|
||||||
|
def close_svip_popups(self, include_modal_close=False, close_delay=1):
    """Click every visible popup close button on the current page (best effort).

    By default only elements matching ``div.dialog-close`` are handled.
    The earlier version also carried a second pass for ``a.ivu-modal-close``
    that sat behind an unconditional ``return`` and therefore never ran;
    it is kept here as an opt-in so the default behaviour is unchanged.

    Args:
        include_modal_close: When true, also click visible
            ``a.ivu-modal-close`` buttons after the dialog pass.
        close_delay: Seconds to sleep after each click so the popup can
            disappear before the next element is examined.

    Returns:
        None. Failures are printed and swallowed on purpose: a popup that
        cannot be closed must not abort the crawl.
    """
    # Imported locally so this method does not rely on a module-level
    # ``import time`` that is not visible from this block.
    import time

    def click_visible(selector, found_msg, click_msg, done_msg, error_msg):
        # One pass: look up all matches, click the visible ones, report.
        try:
            buttons = self.browser.page.query_selector_all(selector)
            print(found_msg.format(len(buttons)))

            for position, button in enumerate(buttons, start=1):
                if button and button.is_visible():
                    print(click_msg.format(position))
                    button.click()
                    time.sleep(close_delay)  # give the popup time to close

            print(done_msg)
        except Exception as e:
            print(f"{error_msg}: {e}")

    click_visible(
        'div.dialog-close',
        '找到 {} 个对话框关闭按钮',
        '点击第 {} 个可见的对话框关闭按钮',
        '已完成关闭可见的对话框',
        '关闭对话框时出错',
    )

    if not include_modal_close:
        return

    # Targets <a class="ivu-modal-close"><i class="ivu-icon ivu-icon-ios-close">
    click_visible(
        'a.ivu-modal-close',
        '找到 {} 个关闭按钮',
        '点击第 {} 个可见的关闭按钮',
        '已完成关闭可见的模态框',
        '关闭弹窗时出错',
    )
|
||||||
|
|
||||||
def crawl_company_detail(self, url: str, refer_url: str = None):
|
def crawl_company_detail(self, url: str, refer_url: str = None):
|
||||||
"""
|
"""
|
||||||
爬取爱企查企业详情页数据
|
爬取爱企查企业详情页数据
|
||||||
@@ -76,39 +117,48 @@ class AiqichaDetailCrawler:
|
|||||||
|
|
||||||
# 访问页面
|
# 访问页面
|
||||||
if self.browser.visit_page(url):
|
if self.browser.visit_page(url):
|
||||||
|
|
||||||
|
self.close_svip_popups()
|
||||||
|
# 保存当前页面的html 到 demo/html/aiqicha-datail.html
|
||||||
|
# 目录不存在时创建
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.browser.page.wait_for_selector('.header-user-center', timeout=10000)
|
||||||
|
print(f".header-user-center1: 等待页面元素ok")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f".header-user-center1: 等待页面元素时出错: {e}")
|
||||||
|
|
||||||
|
self.browser.save_page_html("demo/html/aiqicha-datail.html")
|
||||||
|
try:
|
||||||
# 增强页面加载检查
|
# 增强页面加载检查
|
||||||
|
# 关闭指定的弹窗 (aiqicha-datail.html 792-793 行对应的元素)
|
||||||
|
self.close_svip_popups()
|
||||||
|
self.browser.save_page_html("demo/html/aiqicha-datail-after-close_svip_popups.html")
|
||||||
|
|
||||||
|
# 使用登录管理器检测登录状态
|
||||||
|
login = self.login_manager.check_and_login()
|
||||||
|
if login:
|
||||||
|
print("crawl_company_detail:登录成功")
|
||||||
|
else:
|
||||||
|
print("crawl_company_detail:登录失败")
|
||||||
|
|
||||||
|
self.browser.save_page_html("demo/html/aiqicha-datail-afterchecklogin.html")
|
||||||
|
# 关闭指定的弹窗 (aiqicha-datail.html 792-793 行对应的元素)
|
||||||
|
self.close_svip_popups()
|
||||||
|
self.browser.save_page_html("demo/html/aiqicha-datail-after-close_svip_popups2.html")
|
||||||
|
|
||||||
|
# 等待关键元素加载,增加超时时间
|
||||||
|
|
||||||
print("等待页面关键元素加载...")
|
print("等待页面关键元素加载...")
|
||||||
try:
|
|
||||||
# 等待关键元素加载,增加超时时间
|
|
||||||
try:
|
try:
|
||||||
self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
|
self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
|
||||||
|
print("crawl_company_detail:.addr-enter-bg-ele success")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"等待页面元素时出错: {e}")
|
print(f"等待页面元素时出错: {e}")
|
||||||
# self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
|
# self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
|
||||||
print("关键元素已加载")
|
print("关键元素已加载")
|
||||||
|
|
||||||
# 使用登录管理器检测登录状态
|
|
||||||
logined = self.login_manager.check_and_login()
|
|
||||||
if logined:
|
|
||||||
print("登录成功")
|
|
||||||
else:
|
|
||||||
print("登录失败")
|
|
||||||
|
|
||||||
# 关闭指定的弹窗 (aiqicha-datail.html 792-793 行对应的元素)
|
|
||||||
try:
|
|
||||||
# 查找并点击关闭按钮
|
|
||||||
svip_modals = self.browser.page.query_selector_all('.clue-card-wrap')
|
|
||||||
for modal in svip_modals:
|
|
||||||
if modal:
|
|
||||||
# 查找.ivu-modal-close关闭按钮
|
|
||||||
close_button = modal.query_selector('a.ivu-modal-close')
|
|
||||||
if close_button:
|
|
||||||
close_button.click()
|
|
||||||
print("已关闭SVIP弹窗")
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
print(f"关闭弹窗时出错: {e}")
|
|
||||||
|
|
||||||
# 额外等待一段时间确保页面完全加载
|
# 额外等待一段时间确保页面完全加载
|
||||||
import time
|
import time
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|||||||
@@ -140,6 +140,13 @@ class AiqichaDetailParser:
|
|||||||
text = re.sub(r'\s*查看地图.*$', '', text)
|
text = re.sub(r'\s*查看地图.*$', '', text)
|
||||||
text = re.sub(r'\s*附近企业.*$', '', text)
|
text = re.sub(r'\s*附近企业.*$', '', text)
|
||||||
|
|
||||||
|
# 特殊处理参保人数字段,仅保留数字
|
||||||
|
if field_text == "参保人数":
|
||||||
|
# 提取数字部分,如"7人" -> "7"
|
||||||
|
match = re.search(r'(\d+)', text)
|
||||||
|
if match:
|
||||||
|
text = match.group(1)
|
||||||
|
|
||||||
text = self._clean_text(text)
|
text = self._clean_text(text)
|
||||||
if text:
|
if text:
|
||||||
return text
|
return text
|
||||||
|
|||||||
@@ -19,6 +19,18 @@ class AiqichaLoginManager:
|
|||||||
# 判断当前的url地址 如果是 https://aiqicha.baidu.com/company_detail_* 地址
|
# 判断当前的url地址 如果是 https://aiqicha.baidu.com/company_detail_* 地址
|
||||||
if "company_detail" in self.browser.page.url:
|
if "company_detail" in self.browser.page.url:
|
||||||
# 等待页面加载完成
|
# 等待页面加载完成
|
||||||
|
# .header-user-center-menu
|
||||||
|
try:
|
||||||
|
# 检测用户中心元素判断已登录
|
||||||
|
self.browser.page.wait_for_selector('.header-user-center-menu', timeout=30000)
|
||||||
|
self.browser.save_cookies()
|
||||||
|
print("header-user-center-menu 检测到已登录状态")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print("header-user-center-menu 登录状态检测异常", e)
|
||||||
|
# return False
|
||||||
|
|
||||||
|
|
||||||
self.browser.page.wait_for_load_state("networkidle")
|
self.browser.page.wait_for_load_state("networkidle")
|
||||||
|
|
||||||
# 验证登录状态
|
# 验证登录状态
|
||||||
|
|||||||
@@ -10,6 +10,13 @@ from playwright.async_api import async_playwright
|
|||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
|
||||||
|
def create_directory(directory_path):
    """Create ``directory_path`` (including parents) if it does not exist.

    Args:
        directory_path: Directory to create. An empty value is treated as
            "nothing to create"; this is what ``os.path.dirname`` returns
            for a bare filename, and ``os.makedirs('')`` would raise.

    Returns:
        None. A message is printed only when a directory is created.
    """
    if not directory_path:
        return

    if not os.path.exists(directory_path):
        # exist_ok=True covers the case where the directory appears
        # between the existence check and the creation call.
        os.makedirs(directory_path, exist_ok=True)
        print(f"已创建目录: {directory_path}")
|
||||||
|
|
||||||
|
|
||||||
class WebBrowser:
|
class WebBrowser:
|
||||||
def __init__(self, cookie_path="browser_cookies.json"):
|
def __init__(self, cookie_path="browser_cookies.json"):
|
||||||
self.cookie_path = cookie_path
|
self.cookie_path = cookie_path
|
||||||
@@ -280,6 +287,20 @@ class WebBrowser:
|
|||||||
if self.playwright:
|
if self.playwright:
|
||||||
self.playwright.stop()
|
self.playwright.stop()
|
||||||
|
|
||||||
|
def save_page_html(self, filepath):
    """Write the current page's HTML to ``filepath`` (best effort).

    Args:
        filepath: Destination file. Its parent directory is created when
            the path has a directory component.

    Returns:
        None. Any failure is printed and swallowed, so a failed snapshot
        does not interrupt the caller.
    """
    try:
        # Fetch the page HTML first, before touching the filesystem.
        html_content = self.page.content()

        # A bare filename has an empty dirname; there is nothing to
        # create then, and attempting it would make the whole save fail.
        directory = os.path.dirname(filepath)
        if directory:
            create_directory(directory)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"页面HTML已保存到: {filepath}")
    except Exception as e:
        print(f"保存页面HTML失败: {e}")
|
||||||
|
|
||||||
def visit_page(self, url):
|
def visit_page(self, url):
|
||||||
"""访问指定页面"""
|
"""访问指定页面"""
|
||||||
try:
|
try:
|
||||||
@@ -321,7 +342,15 @@ class WebBrowser:
|
|||||||
delete navigator.__proto__.webdriver;
|
delete navigator.__proto__.webdriver;
|
||||||
""")
|
""")
|
||||||
|
|
||||||
self.page.wait_for_load_state("networkidle")
|
# self.page.wait_for_load_state("networkidle")
|
||||||
|
# 3. 等待页面加载状态而不是特定元素
|
||||||
|
try:
|
||||||
|
self.page.wait_for_load_state('networkidle', timeout=5000)
|
||||||
|
print("networkidle, timeout=5000页面已加载")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"等待页面加载状态时出错: {e}")
|
||||||
|
# self.page.wait_for_load_state('networkidle', timeout=5000)
|
||||||
|
|
||||||
self.human_like_actions()
|
self.human_like_actions()
|
||||||
self.random_behavior()
|
self.random_behavior()
|
||||||
return True
|
return True
|
||||||
|
|||||||
Reference in New Issue
Block a user