Files
SearchCompany/crawler_bing_main.py
2025-11-13 07:28:15 +08:00

232 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import asyncio
import csv
import os
import random
import aiohttp
import time
import sys
from bs4 import BeautifulSoup
import re
import aiofiles
import urllib.parse
import argparse
from colorama import init, Fore
from search import Bing,Baidu
import openpyxl
import ssl
from tool.bing_search import BingSearcher
from tool.csv_tool import CSVTool
from tool.read_csv import CSVReader
start = time.time()  # script start timestamp; NOTE(review): never read in this file — elapsed time is never reported, confirm before removing
def printascii():
    """Print the green ASCII-art banner to stdout."""
    # colorama must be initialised before ANSI colour codes work on Windows.
    init()
    banner = r'''
____ _
/ ___| ___ __ _ _ __ ___| |__ ___ _ __
\___ \ / _ \/ _` | '__/ __| '_ \ / _ \ '__|
___) | __/ (_| | | | (__| | | | __/ |
|____/ \___|\__,_|_| \___|_| |_|\___|_|
'''
    # Wrap the banner in green, then reset so later output is uncoloured.
    print(Fore.GREEN + banner + Fore.RESET)
def filter_aiqicha_qcc(search_result, company_name_, with_not_match=False):
    """Filter Bing search results down to company-registry profile pages.

    Keeps only results whose URL belongs to one of the supported registry
    sites (aiqicha / tianyancha / qcc) and whose title contains the target
    company name — unless ``with_not_match`` is True, in which case the
    title check is skipped and any registry URL is kept.

    Args:
        search_result (list[dict]): result nodes, each with 'title' and 'url'.
        company_name_ (str): company name that must appear in the title.
        with_not_match (bool): when True, keep registry URLs whose title
            does not contain the company name.

    Returns:
        list[dict]: matching nodes, each augmented in place with
        'web_site_type' and 'company_name' keys.
    """
    # (URL pattern, site tag) pairs; first match wins.
    site_patterns = (
        (r"^https://aiqicha.baidu.com/company_detail_.*", "aiqicha"),
        (r"^https://www.tianyancha.com/company/.*", "tianyancha"),
        (r"^https://www.qcc.com/firm/.*", "qcc"),
    )
    datas = []
    for data_node in search_result:
        title = data_node['title']
        url = data_node['url']
        print(f"必应搜索爬取结果为,title:{title}, url:{url}")
        # Skip results whose title does not mention the company, unless the
        # caller explicitly asked to keep non-matches.
        if company_name_ not in title and not with_not_match:
            continue
        for pattern, web_site_type in site_patterns:
            if re.match(pattern, url):
                # Tag the node in place so callers see site type and company.
                data_node['web_site_type'] = web_site_type
                data_node['company_name'] = company_name_
                datas.append(data_node)
                break
    return datas
def check_company_exists(company_names, type_list):
    """Check which companies already appear in company_search_result_data.csv.

    Args:
        company_names (list): company names to check.
        type_list (list): web-site types ("aiqicha", "qcc", ...) to restrict
            the check to; an empty list means "any type counts".

    Returns:
        list: [{"company_name": name, "exists": bool}, ...] in the same
        order as ``company_names``.
    """
    csv_file = 'company_search_result_data.csv'
    # Default every company to "not found".
    result = [{"company_name": name, "exists": False} for name in company_names]
    # No result file yet -> nothing is recorded.
    if not os.path.exists(csv_file):
        return result
    # Collect the (company_name, web_site_type) pairs already on disk.
    # Column layout: web_site_type at index 2, company_name at index 4.
    existing_combinations = set()
    try:
        with open(csv_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            for row in reader:
                # Ignore short/corrupt rows that lack the company column.
                if len(row) >= 5:
                    existing_combinations.add((row[4], row[2]))
    except Exception as e:
        # Best-effort: on a read error report everything as absent,
        # matching the original behaviour.
        print(f"读取CSV文件时出错: {e}")
        return result
    # Names that exist under at least one type (for the empty type_list case).
    names_with_any_type = {name for name, _ in existing_combinations}
    for item in result:
        name = item["company_name"]
        if not type_list:
            # Empty type_list: any recorded type counts as "exists".
            item["exists"] = name in names_with_any_type
        else:
            item["exists"] = any(
                (name, web_site_type) in existing_combinations
                for web_site_type in type_list
            )
    return result
if __name__ == '__main__':
    # Load the company names to look up (first column, no header row).
    reader = CSVReader('data/data.csv')
    company_names = reader.read_column(0, has_header=False)
    print("所有数据:", company_names)
    # Report which companies already exist in the legacy result file.
    # (Only printed here; the skip logic below uses the bing-data CSV.)
    type_list = ["aiqicha", "qcc", "tianyancha"]
    check_result = check_company_exists(company_names, type_list)
    print("检查结果:", check_result)
    i = 1  # 1-based progress counter for log messages
    with BingSearcher() as searcher:
        # Tool bound to the main output file; also used to read back what is
        # already saved so finished companies can be skipped.
        csv_tool = CSVTool(
            csv_file_name='data/company_search_bing_data.csv',
            headers=['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
        )
        # fixed: the file was previously read (and printed) twice in a row.
        all_data = csv_tool.get_all_data()
        print("所有数据:", all_data)
        # Companies already present in the output are skipped below.
        company_names_saved_set = {row["company_name"] for row in all_data}
        for company_name in company_names:
            if company_name in company_names_saved_set:
                print(f"公司 {company_name} 已存在,跳过处理")
                continue
            print(f"正在处理第 {i} 个公司: {company_name}")
            i += 1  # fixed: the counter previously never advanced past 1
            # Bias the search toward the registry sites we can parse.
            addon_args = " 爱企查|企查查"
            data_list = searcher.search(company_name + " " + addon_args, 1)
            filter_list = filter_aiqicha_qcc(data_list, company_name)
            print(company_name, "filter_list:", filter_list)
            if len(filter_list) <= 0:
                print("没有数据 filter_list is empty. " + company_name)
                # Nothing matched strictly: keep the raw registry hits for
                # manual review in a separate "filter is none" file.
                # NOTE: `continue` skips the inter-query sleep on this path,
                # matching the original behaviour.
                filter_list_with_not_match = filter_aiqicha_qcc(
                    data_list, company_name, with_not_match=True)
                none_csv_tool = CSVTool(
                    csv_file_name='data/company_search_filter_is_none_data.csv',
                    headers=['company_name', 'title', 'web_site_type', 'url', 'request_url', 'create_time']
                )
                none_csv_tool.save_data(
                    filter_list_with_not_match,
                    unique_titles=['company_name', 'title', 'url', 'web_site_type'])
                continue
            else:
                # NOTE(review): this header order differs from the CSVTool
                # created above for the same file — confirm which layout the
                # existing data file actually uses before unifying them.
                csv_tool = CSVTool(
                    csv_file_name='data/company_search_bing_data.csv',
                    headers=['company_name', 'title', 'web_site_type', 'url', 'request_url', 'create_time']
                )
                csv_tool.save_data(filter_list,
                                   unique_titles=['company_name', 'web_site_type'])
            # Randomised pause between queries to reduce rate limiting.
            sleep_time = 3
            sleep_time += random.randint(1, 2)
            time.sleep(sleep_time)