# -*- coding: utf-8 -*-
"""Search Bing for company names and persist aiqicha/qcc/tianyancha hits.

Reads company names from data/data.csv, queries Bing for each one
(appending " 爱企查|企查查" to the query), filters the results down to
known company-profile sites, and appends them to CSV files via CSVTool.
Companies already present in the output CSV are skipped.
"""
import asyncio
import csv
import os
import random
import aiohttp
import time
import sys
from bs4 import BeautifulSoup
import re
import aiofiles
import urllib.parse
import argparse
from colorama import init, Fore
from search import Bing, Baidu
import openpyxl
import ssl

from tool.bing_search import BingSearcher
from tool.csv_tool import CSVTool
from tool.read_csv import CSVReader

start = time.time()

# Site-type detection patterns, compiled once.
# FIX: dots are escaped so "." matches literally — the original unescaped
# patterns would also match look-alike hosts (e.g. "wwwXqcc.com").
_SITE_PATTERNS = [
    ("aiqicha", re.compile(r"^https://aiqicha\.baidu\.com/company_detail_.*")),
    ("tianyancha", re.compile(r"^https://www\.tianyancha\.com/company/.*")),
    ("qcc", re.compile(r"^https://www\.qcc\.com/firm/.*")),
]


def printascii():
    """Print the colored ASCII-art startup banner."""
    # Initialise colorama so ANSI colors work on Windows consoles too.
    init()
    print(Fore.GREEN + r'''
 ____                      _
/ ___|  ___  __ _ _ __ ___| |__   ___ _ __
\___ \ / _ \/ _` | '__/ __| '_ \ / _ \ '__|
 ___) |  __/ (_| | | | (__| | | |  __/ |
|____/ \___|\__,_|_|  \___|_| |_|\___|_|
''' + Fore.RESET)


def filter_aiqicha_qcc(search_result, company_name_, with_not_match=False):
    """Filter Bing results down to known company-profile pages.

    Args:
        search_result (list): dicts with at least 'title' and 'url' keys.
        company_name_ (str): company name the result title must contain.
        with_not_match (bool): when True, keep results even if the title
            does not contain the company name (used to log near-misses).

    Returns:
        list: matching result dicts, each augmented in place with
        'web_site_type' ("aiqicha" / "tianyancha" / "qcc") and
        'company_name'.
    """
    datas = []
    for data_node in search_result:
        title = data_node['title']
        url = data_node['url']
        print(f"必应搜索爬取结果为,title:{title}, url:{url}")
        # Require the title to mention the company, unless relaxed.
        if title.find(company_name_) == -1 and not with_not_match:
            continue
        # Classify the URL by site; first matching pattern wins.
        web_site_type = next(
            (site for site, pattern in _SITE_PATTERNS if pattern.match(url)),
            None,
        )
        if web_site_type is not None:
            data_node['web_site_type'] = web_site_type
            data_node['company_name'] = company_name_
            datas.append(data_node)
    return datas


def check_company_exists(company_names, type_list):
    """Check which companies already appear in company_search_result_data.csv.

    Args:
        company_names (list): company names to look up.
        type_list (list): site types to restrict the lookup to; when empty,
            a company counts as existing if it appears under any type.

    Returns:
        list: [{"company_name": name, "exists": bool}, ...] in input order.
        All entries are False when the CSV is missing or unreadable.
    """
    csv_file = 'company_search_result_data.csv'
    # Default every company to "not present".
    result = [{"company_name": name, "exists": False} for name in company_names]

    if not os.path.exists(csv_file):
        return result

    existing_combinations = set()  # (company_name, web_site_type) pairs
    try:
        with open(csv_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            for row in reader:
                # company_name is column 5 (index 4), type column 3 (index 2);
                # rows shorter than 5 fields are skipped as malformed.
                if len(row) >= 5:
                    existing_combinations.add((row[4], row[2]))
    except Exception as e:
        # Best-effort: report and fall back to "nothing exists".
        print(f"读取CSV文件时出错: {e}")
        return result

    for item in result:
        name = item["company_name"]
        if not type_list:
            # No type filter: any recorded type counts.
            item["exists"] = any(c == name for c, _ in existing_combinations)
        else:
            item["exists"] = any(
                (name, web_site_type) in existing_combinations
                for web_site_type in type_list
            )
    return result


def main():
    """Drive the search loop: load names, skip saved ones, search, persist."""
    reader = CSVReader('data/data.csv')
    company_names = reader.read_column(0, has_header=False)
    print("所有数据:", company_names)

    # Diagnostic report of which companies already have results.
    type_list = ["aiqicha", "qcc", "tianyancha"]
    check_result = check_company_exists(company_names, type_list)
    print("检查结果:", check_result)

    with BingSearcher() as searcher:
        csv_tool = CSVTool(
            csv_file_name='data/company_search_bing_data.csv',
            headers=['title', 'url', 'web_site_type', 'request_url',
                     'company_name', 'create_time'],
        )
        # FIX: the data was previously fetched and printed twice in a row.
        all_data = csv_tool.get_all_data()
        print("所有数据:", all_data)

        # Companies already persisted are skipped in the loop below.
        company_names_saved_set = {row["company_name"] for row in all_data}

        i = 1
        for company_name in company_names:
            if company_name in company_names_saved_set:
                print(f"公司 {company_name} 已存在,跳过处理")
                continue

            print(f"正在处理第 {i} 个公司: {company_name}")
            i += 1  # FIX: counter was never incremented, always printed 1

            addon_args = " 爱企查|企查查"
            data_list = searcher.search(company_name + " " + addon_args, 1)
            filter_list = filter_aiqicha_qcc(data_list, company_name)
            print(company_name, "filter_list:", filter_list)

            if not filter_list:
                print("没有数据 filter_list is empty. " + company_name)
                # Persist the unmatched raw results for later inspection.
                filter_list_with_not_match = filter_aiqicha_qcc(
                    data_list, company_name, with_not_match=True)
                miss_tool = CSVTool(
                    csv_file_name='data/company_search_filter_is_none_data.csv',
                    headers=['company_name', 'title', 'web_site_type', 'url',
                             'request_url', 'create_time'],
                )
                miss_tool.save_data(
                    filter_list_with_not_match,
                    unique_titles=['company_name', 'title', 'url',
                                   'web_site_type'],
                )
            else:
                hit_tool = CSVTool(
                    csv_file_name='data/company_search_bing_data.csv',
                    headers=['company_name', 'title', 'web_site_type', 'url',
                             'request_url', 'create_time'],
                )
                hit_tool.save_data(
                    filter_list,
                    unique_titles=['company_name', 'web_site_type'],
                )

            # Randomized delay to avoid rate limiting.
            # FIX: the empty-result branch previously `continue`d past this
            # sleep even though a search had just been performed.
            time.sleep(3 + random.randint(1, 2))


if __name__ == '__main__':
    main()