# -*- coding: utf-8 -*-
import asyncio
import csv
import os
import random

import aiohttp
import time
import sys
from bs4 import BeautifulSoup
import re
import aiofiles
import urllib.parse
import argparse
from colorama import init, Fore
from search import Bing, Baidu
import openpyxl
import ssl

from tool.bing_search import BingSearcher
from tool.csv_tool import CSVTool
from tool.read_csv import CSVReader

start = time.time()


def printascii():
    # Initialize colorama
    init()
    # Print the banner in green
    print(Fore.GREEN + r'''
 ____                      _
/ ___|  ___  __ _ _ __ ___| |__   ___ _ __
\___ \ / _ \/ _` | '__/ __| '_ \ / _ \ '__|
 ___) |  __/ (_| | | | (__| | | |  __/ |
|____/ \___|\__,_|_|  \___|_| |_|\___|_|
''' + Fore.RESET)


def filter_aiqicha_qcc(search_result, company_name_, with_not_match=False):
    """Filter search results down to aiqicha/tianyancha/qcc company pages.

    A result is kept when its title contains company_name_ (or always, when
    with_not_match is True) and its URL belongs to one of the supported sites.
    Each kept node is tagged with web_site_type and company_name.
    """
    datas = []

    for i in range(len(search_result)):
        data_node = search_result[i]
        title = data_node['title']
        url = data_node['url']
        print(f"Bing search result: title:{title}, url:{url}")

        # Check whether the title contains company_name_
        # if re.match(
        #         r"^https://aiqicha.baidu.com/company_detail_.*|https://www.qcc.com/firm/.*|https://www.tianyancha.com/company/.*",
        #         url) and title.find(company_name_) != -1:
        if title.find(company_name_) != -1 or with_not_match:
            web_site_type = None
            if re.match(r"^https://aiqicha.baidu.com/company_detail_.*", url):
                web_site_type = "aiqicha"
            elif re.match(r"^https://www.tianyancha.com/company/.*", url):
                web_site_type = "tianyancha"
            elif re.match(r"^https://www.qcc.com/firm/.*", url):
                web_site_type = "qcc"

            if web_site_type is not None:
                data_node['web_site_type'] = web_site_type
                data_node['company_name'] = company_name_
                datas.append(data_node)
    return datas
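
# A minimal usage sketch for filter_aiqicha_qcc, kept as comments so it does
# not execute with the script. The sample nodes are hypothetical; real ones
# come from BingSearcher.search() and are assumed to carry at least 'title'
# and 'url' keys, as used above.
#
#   sample = [
#       {"title": "Example Tech Co., Ltd. - 爱企查", "url": "https://aiqicha.baidu.com/company_detail_123"},
#       {"title": "Unrelated page", "url": "https://example.com/abc"},
#   ]
#   matched = filter_aiqicha_qcc(sample, "Example Tech Co., Ltd.")
#   # -> keeps only the first node, tagged with web_site_type="aiqicha"
#   #    and company_name="Example Tech Co., Ltd."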


def check_company_exists(company_names, type_list):
    """
    Read company_search_result_data.csv and check whether the given company
    names and site types already exist in it.

    Args:
        company_names (list): list of company names
        type_list (list): list of site types

    Returns:
        list: list of dicts with each company name and its existence flag
        Format: [{"company_name": "name", "exists": True/False}, ...]
    """
    csv_file = 'company_search_result_data.csv'
    result = []

    # Mark every company as not existing initially
    for company_name_item in company_names:
        result.append({
            "company_name": company_name_item,
            "exists": False
        })

    # If the file does not exist, return the initial result as-is
    if not os.path.exists(csv_file):
        return result

    # Read the existing data from the CSV file
    existing_combinations = set()  # stores (company name, site type) pairs
    try:
        with open(csv_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            header_skipped = False

            for row in reader:
                if not header_skipped:
                    header_skipped = True
                    continue

                # Make sure the row is complete
                if len(row) >= 5:
                    company_name_item = row[4]  # company_name is in column 5 (index 4)
                    web_site_type = row[2] if len(row) > 2 else ""  # web_site_type is in column 3 (index 2)

                    # Add to the set of existing combinations
                    existing_combinations.add((company_name_item, web_site_type))
    except Exception as e:
        print(f"Error while reading the CSV file: {e}")
        return result

    # Check whether each company exists for the given site types
    for item in result:
        company_name_item = item["company_name"]
        exists = False

        # If type_list is empty, check whether the company exists under any type
        if not type_list:
            for existing_company, _ in existing_combinations:
                if existing_company == company_name_item:
                    exists = True
                    break
        else:
            # Check whether the company exists under one of the given types
            for web_site_type in type_list:
                if (company_name_item, web_site_type) in existing_combinations:
                    exists = True
                    break

        item["exists"] = exists

    return result
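
# A minimal usage sketch for check_company_exists, kept as comments. It assumes
# company_search_result_data.csv has a header row, company_name in column 5 and
# web_site_type in column 3, exactly as read above; the company name below is
# hypothetical.
#
#   status = check_company_exists(["Example Tech Co., Ltd."], ["aiqicha", "qcc"])
#   # -> [{"company_name": "Example Tech Co., Ltd.", "exists": True}]
#   #    (exists is True only if a matching (name, type) row is already saved)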


if __name__ == '__main__':
    reader = CSVReader('data/data.csv')
    company_names = reader.read_column(0, has_header=False)
    print("All input data:", company_names)

    # Check which companies already exist
    type_list = ["aiqicha", "qcc", "tianyancha"]
    check_result = check_company_exists(company_names, type_list)
    print("Check result:", check_result)
    i = 1
    # Method 2: use the searcher as a context manager
    with BingSearcher() as searcher:
        # Create a CSV tool instance
        csv_tool = CSVTool(
            csv_file_name='data/company_search_bing_data.csv',
            headers=['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
        )
        # Query all previously saved data
        all_data = csv_tool.get_all_data()
        print("All saved data:", all_data)

        # Collect the companies that have already been saved
        company_names_saved_set = set()
        for company_name_item in all_data:
            company_names_saved_set.add(company_name_item["company_name"])

        for company_name in company_names:

            # Skip companies that already exist
            if company_name in company_names_saved_set:
                print(f"Company {company_name} already exists, skipping")
                continue

            print(f"Processing company #{i}: {company_name}")
            i += 1
            addon_args = " 爱企查|企查查"
            data_list = searcher.search(company_name + " " + addon_args, 1)
            filter_list = filter_aiqicha_qcc(data_list, company_name)
            print(company_name, "filter_list:", filter_list)

            if len(filter_list) <= 0:
                print("No data, filter_list is empty. " + company_name)

                filter_list_with_not_match = filter_aiqicha_qcc(data_list, company_name, with_not_match=True)
                # Create a CSV tool instance for unmatched results
                csv_tool = CSVTool(
                    csv_file_name='data/company_search_filter_is_none_data.csv',
                    headers=['company_name', 'title', 'web_site_type', 'url', 'request_url', 'create_time']
                )

                # Save the data, specifying the deduplication fields
                csv_tool.save_data(filter_list_with_not_match,
                                   unique_titles=['company_name', 'title', 'url', 'web_site_type'])

                continue
            else:
                # Create a CSV tool instance for matched results
                csv_tool = CSVTool(
                    csv_file_name='data/company_search_bing_data.csv',
                    headers=['company_name', 'title', 'web_site_type', 'url', 'request_url', 'create_time']
                )

                # Save the data, specifying the deduplication fields
                csv_tool.save_data(filter_list,
                                   unique_titles=['company_name', 'web_site_type'])
                # save_to_csv(filter_list)

            # Debug limit (disabled):
            # if i > 3:
            #     print("End the loop")
            #     break
            # results2 = searcher.search("腾讯", 1)
            # results3 = searcher.search("百度", 1)

            # Pause briefly between searches to avoid hammering Bing
            sleep_time = 3
            sleep_time += random.randint(1, 2)
            time.sleep(sleep_time)