# Files
# SearchCompany/search/Bing.py
# manchuwork 9d0f18a121 cookies
# 2025-09-05 16:51:46 +08:00
#
# 91 lines
# 3.3 KiB
# Python
# -*- coding: utf-8 -*-
import asyncio
import os
import sys
import urllib.parse
from urllib.parse import quote
import aiohttp
from bs4 import BeautifulSoup
from colorama import Fore
# 添加项目根目录到 sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config
# Request headers and proxy are shared project-wide via the config module.
bingheaders=config.bingheaders
proxy=config.proxy
# Per-request timeout: no overall cap, but connect and read each limited to 5.5s.
timeout = aiohttp.ClientTimeout(
total=None, # no total timeout
sock_connect=5.5, # connection timeout: 5.5 seconds
sock_read=5.5 # read timeout: 5.5 seconds
)
async def getbing(url, session):
    """Fetch one Bing results page and extract result titles and links.

    Args:
        url: Fully-built Bing search URL for a single results page.
        session: Shared aiohttp.ClientSession used for the request.

    Returns:
        (url_list, title_list): parallel lists of result links and titles;
        both empty when the page cannot be fetched or parsed.
    """
    url_list = []
    title_list = []
    try:
        # The request itself is inside the try: in the original, connect/read
        # timeouts raised outside the handler and crashed the whole gather().
        async with session.get(url, headers=bingheaders, timeout=timeout) as resp:
            html = await resp.text()
        soup = BeautifulSoup(html, 'lxml')
        # Bing renders each organic result title as an <a> inside an <h2>.
        for h in soup.select('h2 a'):
            htext = h.text.replace('\n', '').replace(',', ' ').strip()
            hurl = h.get('href')
            # Some results carry relative hrefs; resolve against the site root.
            if not hurl.startswith(('http://', 'https://')):
                hurl = urllib.parse.urljoin('https://cn.bing.com/', hurl)
            print(htext, " ", hurl)
            title_list.append(htext)
            url_list.append(hurl)
    except Exception:
        # The original bare `except:` also swallowed asyncio.CancelledError,
        # which breaks task cancellation; Exception keeps best-effort behavior.
        print(f"必应页面爬取失败,{url}该url无法正常获取数据。")
        return [], []
    return url_list, title_list
def _parse_page_range(num):
    """Translate a page spec into a (start, end) result-offset pair.

    Accepts either a page count (e.g. "3" or 3 -> offsets 0..30) or an
    inclusive page span "a:b" (e.g. "2:5" -> offsets 10..50). Offsets step
    by 10, Bing's results-per-page.

    Raises:
        ValueError: on more than one ':' or non-numeric span endpoints.
    """
    num = str(num)  # accept int page counts too, backward-compatibly
    if ':' in num:
        if num.count(':') > 1:
            raise ValueError("输入中必须且只能包含一个 ':'")
        start_page, end_page = num.split(':')
        if not (start_page.isdigit() and end_page.isdigit()):
            raise ValueError("':' 两侧的值必须是数字")
        return (int(start_page) - 1) * 10, int(end_page) * 10
    return 0, int(num) * 10


async def bing_spinder(keyword, num):
    """Crawl Bing search pages for `keyword` concurrently.

    Args:
        keyword: URL-encoded search query.
        num: Page count ("3") or inclusive page span ("2:5").

    Returns:
        (titlelist, urllist): aggregated result titles and links.
    """
    print(f'必应爬取任务进行中,爬取页数为{num}...')
    print('标题 url')
    urllist = []
    titlelist = []
    start_page, end_page = _parse_page_range(num)
    async with aiohttp.ClientSession() as session:
        tasks = []
        for pn in range(start_page, end_page, 10):
            # BUG FIX: the original URL dropped the `first` offset, so every
            # task fetched the identical first page. Bing's offset is 1-based
            # (first=1, 11, 21, ...).
            url = (f'https://cn.bing.com/search?q={keyword}'
                   f'&qs=n&form=QBRE&sp=-1&lq=0&first={pn + 1}')
            tasks.append(asyncio.create_task(getbing(url, session)))
        results = await asyncio.gather(*tasks)
    for page_urls, page_titles in results:
        urllist += page_urls
        titlelist += page_titles
    print(f"必应搜索爬取结果为{len(urllist)}")
    print(Fore.GREEN + '必应爬取任务完成\n' + Fore.RESET)
    return titlelist, urllist
def bing_main(keyword, num):
    """Synchronous entry point: URL-encode the keyword and run the spider.

    Args:
        keyword: Raw (un-encoded) search query.
        num: Page count or "a:b" span, forwarded to bing_spinder.

    Returns:
        (titlelist, urllist) from bing_spinder.
    """
    keyword = quote(keyword)
    if sys.platform.startswith('win'):
        # NOTE(review): presumably to avoid aiohttp issues with the default
        # Proactor loop on Windows — confirm before removing.
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
    # pattern (DeprecationWarning since Python 3.10) and closes the loop.
    return asyncio.run(bing_spinder(keyword, num))
async def Bing_main(keywords, num):
    """Async entry point; delegates directly to bing_spinder.

    Returns (titlelist, urllist) exactly as bing_spinder produces them.
    """
    result = await bing_spinder(keywords, num)
    return result