# -*- coding: utf-8 -*-
import asyncio
import os
import sys
import urllib.parse
from urllib.parse import quote

import aiohttp
from bs4 import BeautifulSoup
from colorama import Fore

# Add the project root directory to sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config

bingheaders = config.bingheaders
proxy = config.proxy  # read from config; not applied to requests in this module

timeout = aiohttp.ClientTimeout(
    total=None,        # no overall timeout
    sock_connect=5.5,  # connection timeout: 5.5 seconds
    sock_read=5.5      # read timeout: 5.5 seconds
)


async def getbing(url, session):
    """Fetch one Bing result page and return its (urls, titles) lists."""
    url_list = []
    title_list = []
    try:
        async with session.get(url, headers=bingheaders, timeout=timeout) as resp:
            # print("Crawling url: " + url)
            html = await resp.text()
        soup = BeautifulSoup(html, 'lxml')
        # Each organic result is an <h2><a> pair: the link text is the title
        for h in soup.select('h2 a'):
            htext = h.text.replace('\n', '').replace(',', ' ').strip()
            hurl = h.get('href')
            # Bing sometimes returns relative links; resolve them against the host
            if not hurl.startswith(('http://', 'https://')):
                hurl = urllib.parse.urljoin('https://cn.bing.com/', hurl)
            print(htext, " ", hurl)
            title_list.append(htext)
            url_list.append(hurl)
    except Exception:
        print(f"Bing page crawl failed: no data could be retrieved from {url}.")
        return [], []
    return url_list, title_list


async def bing_spinder(keyword, num):
    print(f'Bing crawl task in progress, pages to crawl: {num}...')
    print('Title URL')
    urllist = []
    titlelist = []
    tasks = []
    # num is either a single page count ("3") or a page range ("2:5")
    if ':' in num:
        if num.count(':') > 1:
            raise ValueError("The input must contain exactly one ':'")
        start_page, end_page = num.split(':')
        if not (start_page.isdigit() and end_page.isdigit()):
            raise ValueError("Both values around ':' must be numbers")
        start_page = (int(start_page) - 1) * 10
        end_page = int(end_page) * 10
    else:
        start_page, end_page = 0, int(num) * 10
    async with aiohttp.ClientSession() as session:
        for pn in range(start_page, end_page, 10):
            # `first` is Bing's result offset, so each request fetches a different page
            url = f'https://cn.bing.com/search?q={keyword}&first={pn}&qs=n&form=QBRE&sp=-1&lq=0'
            # print("Crawling url: " + url)
            tasks.append(asyncio.create_task(getbing(url, session)))
        results = await asyncio.gather(*tasks)
    # Merge the per-page results in request order
    for page_urls, page_titles in results:
        urllist += page_urls
        titlelist += page_titles
    count = len(urllist)
    print(f"Bing search crawl returned {count} results")
    print(Fore.GREEN + 'Bing crawl task complete\n' + Fore.RESET)
    return titlelist, urllist
    # await bingwriteCSV(titlelist, urllist, keyword)


def bing_main(keyword, num):
    keyword = quote(keyword)
    if sys.platform.startswith('win'):
        # aiohttp is more reliable with the selector event loop on Windows
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    return asyncio.run(bing_spinder(keyword, num))


async def Bing_main(keywords, num):
    # Async entry point for callers already running inside an event loop
    return await bing_spinder(keywords, num)
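

# Usage sketch (illustrative only): a minimal example of calling the two entry
# points above. It assumes config.py provides `bingheaders` and `proxy`; the
# keyword and page range below are hypothetical placeholders.
if __name__ == '__main__':
    # Synchronous entry point: crawl Bing result pages 1 through 2 for a keyword
    titles, urls = bing_main('test keyword', '1:2')
    for t, u in zip(titles, urls):
        print(t, u)

    # From code that already runs inside an event loop, await the async variant:
    #     titles, urls = await Bing_main('test keyword', '2')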