# -*- coding: utf-8 -*-
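"""Searcher: coroutine-based scraper for Bing/Baidu search results (Tianxin Security Lab).

Reads company names from data.csv via CSVReader, queries the engines through
the `search` package, filters hits down to aiqicha/qcc/tianyancha
company-profile pages, and can write results to an .xlsx workbook with
clickable hyperlinks.
"""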
import argparse
import random
import re
import sys
import time

import openpyxl
from colorama import init, Fore

from search import Bing, Baidu
from tool.read_csv import CSVReader

start = time.time()  # script start time, used for the total-runtime report


def printascii():
    # Initialize colorama so ANSI colors render on every platform
    init()
    # Print the banner in green
    print(Fore.GREEN + r'''
 ____                      _
/ ___|  ___  __ _ _ __ ___| |__   ___ _ __
\___ \ / _ \/ _` | '__/ __| '_ \ / _ \ '__|
 ___) |  __/ (_| | | | (__| | | |  __/ |
|____/ \___|\__,_|_|  \___|_| |_|\___|_|
''' + Fore.RESET)
    # Tianxin Security Lab


def writeExcel(titles, links, ws):
    '''Append (title, link) pairs as rows on the given worksheet.'''
    infos = list(zip(titles, links))
    for row in infos:
        ws.append(row)


def create_sheet_and_write(wb, engine, keywords, num, title):
    '''Run one search engine and write its (titles, links) result to a new sheet.'''
    ws = wb.create_sheet(title=title)
    result = engine(keywords, num)
    writeExcel(result[0], result[1], ws)


def excel_text2url(link_url):
    # Note: if a function body does no async work, declaring it `async`
    # has no real effect on performance or functionality.
    '''Convert a URL string into an Excel HYPERLINK formula so the cell becomes a clickable link.'''
    return f'=HYPERLINK("{link_url}","{link_url}")'
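# For example ("x.html" is an illustrative path, not a real record):
#   excel_text2url("https://www.qcc.com/firm/x.html")
#   -> '=HYPERLINK("https://www.qcc.com/firm/x.html","https://www.qcc.com/firm/x.html")'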


# Walk every worksheet and rewrite each value in column 2 through excel_text2url
def update_hyperlinks(wb):
    for sheet in wb.worksheets:  # every worksheet in the workbook
        # iterate over column 2 only (the link column)
        for row in sheet.iter_rows(min_row=1, max_row=sheet.max_row, min_col=2, max_col=2):
            for cell in row:
                if cell.value:  # only rewrite non-empty cells
                    cell.value = excel_text2url(cell.value)  # URL -> HYPERLINK formula
                else:
                    break
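# Minimal usage sketch, assuming a workbook whose sheets hold URLs in
# column 2 ("results.xlsx" is a hypothetical file name):
#   wb = openpyxl.load_workbook("results.xlsx")
#   update_hyperlinks(wb)
#   wb.save("results.xlsx")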


def commend():
    parser = argparse.ArgumentParser(
        prog="Searcher",
        description='Coroutine-based scraper for Baidu, Bing, and Google search results -- Tianxin Security Lab',
        usage='please read -h')
    parser.add_argument("-k", type=str, help="keywords to search for", nargs='+')
    parser.add_argument("-p", type=str, default='5',
                        help="number of pages to search, default 5; ranges are supported, e.g. 2:5 searches pages 2 through 5")
    parser.add_argument("-m", type=str, default='all', nargs='+',
                        help="search engine(s) to use: Baidu: bd, Bing: bin, Google: goo; defaults to all")
    # parser.add_argument("-t", '--task', type=int, help="number of worker tasks, default 8", default=8)
    parser.exit_on_error = False
    args = parser.parse_args()
    if len(sys.argv) == 1:
        # No arguments given: show the banner and the help text, then exit
        printascii()
        parser.print_help()
        sys.exit()
    return args
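# Example invocation (the script file name is hypothetical; flags as in the
# help text above). Note that commend() is not called from __main__ below.
#   python searcher.py -k keyword1 keyword2 -p 2:5 -m bin bd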


def search_company_info(company_name_arg, num):
    keywords = company_name_arg
    # for key in keyword:
    #     keywords = keywords + key + " "
    keywords = keywords.strip()
    result = Bing.bing_main(keywords, num)

    # Walk result[0] (titles) and result[1] (URLs) in parallel
    data_list = []
    for i in range(len(result[0])):
        title = result[0][i]
        url = result[1][i]
        print(f"Bing crawl result, title: {title}, url: {url}")
        # Keep only company-profile pages from aiqicha / qcc / tianyancha
        if re.match(r"^https://(aiqicha\.baidu\.com/company_detail_|www\.qcc\.com/firm/|www\.tianyancha\.com/company/)", url):
            data_list.append([title, url])
    return data_list
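# Illustrative call (actual output depends on live Bing results; the company
# name here is a placeholder):
#   search_company_info("某公司 爱企查|企查查", '1')
#   returns [[title, url], ...] where every url is an
#   aiqicha/qcc/tianyancha company-profile page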


def filter_company_sites(urls):
    # Sample inputs:
    #   https://www.tianyancha.com/company/5226478758
    #   https://aiqicha.baidu.com/company_detail_26602790857925
    #   https://www.qcc.com/firm/05b449eb5cc417d0f97c14104051f5c0.html
    # Keep URLs with prefix https://aiqicha.baidu.com/company_detail_*,
    # https://www.qcc.com/firm/*.html, or https://www.tianyancha.com/company/*
    filtered_urls = [url for url in urls
                     if re.match(r"^https://(aiqicha\.baidu\.com/company_detail_|www\.qcc\.com/firm/|www\.tianyancha\.com/company/)", url)]
    return filtered_urls
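# For example, using a sample URL from the comment above plus one non-match:
#   filter_company_sites(["https://www.tianyancha.com/company/5226478758",
#                         "https://example.com/other"])
#   -> ["https://www.tianyancha.com/company/5226478758"]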


def search_one_company(company_name_arg, num):
    keywords = company_name_arg
    # for key in keyword:
    #     keywords = keywords + key + " "
    keywords = keywords.strip()
    print(f"Your search keywords: {keywords}")
    wb = openpyxl.Workbook()
    # Remove the default worksheet openpyxl creates (named "Sheet")
    wb.remove(wb['Sheet'])
    printascii()
    # Strip characters that are illegal in file names
    pattern = r"[\\/:\*\?\"<>|]"
    keyword = re.sub(pattern, "", keywords)
    create_sheet_and_write(wb, Bing.bing_main, keywords, num, "Bing crawl results")
    create_sheet_and_write(wb, Baidu.baidu_main, keywords, num, "Baidu crawl results")
    # Turn every URL into a hyperlink formula, so a click opens the page
    update_hyperlinks(wb)
    # Save under the sanitized keyword only; the raw company_name_arg may
    # still contain characters that are illegal in file names
    wb.save(f'./{keyword}.xlsx')
    print(Fore.GREEN + 'All tasks finished!' + Fore.RESET)
    end = time.time()
    print(Fore.RED + f'Total script time: {end - start:.2f}s' + Fore.RESET)
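# Hedged usage sketch ("某某公司" is a placeholder name):
#   search_one_company("某某公司", '5') would write ./某某公司.xlsx with one
#   sheet of Bing results and one of Baidu results.
# Like commend(), this helper is not invoked from __main__ below.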


if __name__ == '__main__':
    reader = CSVReader('data.csv')
    company_names = reader.read_column(0, has_header=False)
    print("All data:", company_names)

    i = 1
    for company_name in company_names:
        # Sleep 6-10 seconds between queries to reduce the chance of rate limiting
        sleep_time = 5
        sleep_time += random.randint(1, 5)
        time.sleep(sleep_time)
        # Restrict the query to the company-info sites (aiqicha / qichacha)
        company_name += " 爱企查|企查查"
        data_list = search_company_info(company_name, '1')
        print(data_list)
        i = i + 1
        if i > 1:  # debug limit: stop after the first company
            break