best
2025-11-13 07:28:15 +08:00
parent 92c22841ee
commit a070bda18f
7 changed files with 547 additions and 214 deletions


@@ -3,8 +3,9 @@
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
<excludeFolder url="file://$MODULE_DIR$/.venv1" />
</content>
<orderEntry type="jdk" jdkName="Python 3.13 virtualenv at D:\gitstudy\pythonwork\SearchCompany\.venv" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.13 (SearchCompany)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

.idea/misc.xml generated

@@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="Python 3.13 (SearchCompany)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 virtualenv at D:\gitstudy\pythonwork\SearchCompany\.venv" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (SearchCompany)" project-jdk-type="Python SDK" />
</project>


@@ -1,10 +1,21 @@
# Ideally, fill in cookies captured after logging in to all three search engines
# bingheaders = {
# 'cookie': """""",
# 'referer': 'https://cn.bing.com/',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4051.0 Safari/537.36 Edg/82.0.425.0'}
# Make sure `bingheaders` contains the following fields
bingheaders = {
'cookie': """""",
'referer': 'https://cn.bing.com/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4051.0 Safari/537.36 Edg/82.0.425.0'}
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://cn.bing.com/',
'Connection': 'keep-alive',
'Cache-Control': 'no-cache',
}
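A minimal sketch of how a header dict like bingheaders can be passed to an aiohttp request (the fetch_bing helper and the trimmed header set below are illustrative, not part of the project):

import asyncio
import aiohttp

# Illustrative stand-in for the project's header config; substitute the real bingheaders import.
bingheaders = {
    'cookie': '',
    'referer': 'https://cn.bing.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}

async def fetch_bing(keyword: str) -> str:
    # Send the configured headers with every request so Bing sees a browser-like client.
    url = f'https://cn.bing.com/search?q={keyword}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=bingheaders, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            return await resp.text()

html = asyncio.run(fetch_bing('test'))
print(len(html))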
baiduheaders = {
'Cookie': """PSTM=1755051967; BAIDUID=9623ABA6AF15935E519C6D57EB04D5BD:FG=1; BIDUPSID=BFDEAE9917763352A1CF94FF7A9AD50F; BD_UPN=12314753; delPer=0; BD_CK_SAM=1; PSINO=3; BAIDUID_BFESS=9623ABA6AF15935E519C6D57EB04D5BD:FG=1; ZFY=LX6tLiXJLyE8Spg0Tn3yWYhYWOqUXgNuD45NXzSsgDY:C; baikeVisitId=6e4f6130-a8eb-49b3-8413-1815a6af31a3; BD_HOME=1; ppfuid=FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGlT/s3qQuIlmw0dmIvm22ZTGEimjy3MrXEpSuItnI4KDyGSNvJz3OVxhMd6l0BD7nHci+eNtO+sUfx41sINYk+w3il4JkBUe91yGyLjoc4piSRx4OH9u8PLj7EqnTyQEyOWgTqV0RFcOD/4ANUzZZkGhGlPjfasITJONp0AJTY8kGLSgWjlFVG9Xmh1+20oPSbrzvDjYtVPmZ+9/6evcXmhcO1Y58MgLozKnaQIaLfWRPAn9I0uOqAMff6fuUeWcH0OjH2+RiDANKDxQc+RdNr2uC5D1fu00TizBtFeq9APvs5FjnYxYstXg/9EfB3EVmJIvdK3BvFGk0IgcgSSzt63lV1Uhhp5FAe6gNJIUptp7EMAaXYKm11G+JVPszQFdp9AJLcm4YSsYUXkaPI2Tl66J246cmjWQDTahAOINR5rXR5r/7VVI1RMZ8gb40q7az7vCK56XLooKT5a+rsFrf5Zu0yyCiiagElhrTEOtNdBJJq8eHwEHuFBni9ahSwpC7lbKkUwaKH69tf0DFV7hJROiLETSFloIVkHdy3+I2JUr1LsplAz0hMkWt/tE4tXVUV7QcTDTZWS/2mCoS/GV3N9awQ6iM6hs/BWjlgnEa1+5gbcves5wJ6gbk0b0Avk9wGRtTVVEE/aHCSd+6WFfR1C5FKazXcZ/j40FJv+iLGBn3nkkgHlne61I8I7KhtQgIkmBMJIjPMkS/L051MeqdGScsKYTJuSucgI5c3+79eVH+y2TvbOTuuHv1uGxwXFb2atIU1ZYPbmmXculmizKcKIUiL64VMhr/ZycHJ3jpdZlyprBJR80ygAVuGrjl4whGbgBRkDPTwtXjYtgzmW74m0fDU2MZaxpBZZF8YurfocYcmDdcxFKeoIFQmVqAoAU+3YcXQt2xKThZZyV1v3sCvnzidUZtKM9cRRUfRWBtQSb50APM+gs/408xg7KHCB8AOKpZpfIpPhQ0RJhew8GR0aTqYsJo1IRCwM3UbbrvtJ7eqPMNzJcGcSYcQWm1FubInMonve94c+p8Vi2wc72MfReeFiTzMp1G6pDt2e40gPDGbdQI+jba4UjRlyA+9CbTW6Mt45W/80hW/gFEKh9+Klyky6FPenbJgt/vQK9TAiTA==; BDUSS=o4ZFV6UTVucGp0Rmx6TlNucFQ1Z1FEMnQyZ1ZOdmRUZWg2Nn5FQWxteWdBTVZvSVFBQUFBJCQAAAAAAAAAAAEAAAAXn3lCu8PRqdTGtssAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKBznWigc51oZW; BDUSS_BFESS=o4ZFV6UTVucGp0Rmx6TlNucFQ1Z1FEMnQyZ1ZOdmRUZWg2Nn5FQWxteWdBTVZvSVFBQUFBJCQAAAAAAAAAAAEAAAAXn3lCu8PRqdTGtssAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKBznWigc51oZW; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22370464293%22%2C%22first_id%22%3A%22198a7105b40582-0a1b1b0944bf378-4c657b58-1440000-198a7105b4183a%22%2C%22props%22%3A%7B%7D%2C%22%24device_id%22%3A%22198a7105b40582-0a1b1b0944bf378-4c657b58-1440000-198a7105b4183a%22%7D; MCITY=-179%3A; log_first_time=1755482524636; log_last_time=1755482544322; RT="z=1&dm=baidu.com&si=1403e7da-9af8-439d-bdca-61f492a1b52a&ss=mecm9ry0&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=jhi&ul=3upx87&hd=3upxa3"; H_PS_PSSID=62325_63147_63327_63948_64048_64174_64248_64245_64258_64260_64317_64358_64366_64362_64363_64395_64414_64429_64436_64442_64450_64457_64473_64483_64502_64512_64448_64087_64559_64571; BA_HECTOR=048lak8h81218h8h8020850k80a00g1ka54mp25; H_WISE_SIDS=62325_63147_63327_63948_64048_64174_64248_64245_64258_64260_64317_64358_64366_64362_64363_64395_64414_64429_64436_64442_64450_64457_64473_64483_64502_64512_64448_64087_64559_64571; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; COOKIE_SESSION=21_0_8_9_13_23_0_1_8_9_1_6_498875_0_0_0_1754446941_0_1755485313%7C9%231543375_16_1753882701%7C7; H_PS_645EC=1275d4%2BgYNOGPU5%2Fgp6XcloUiDEOGWs8LNx7nISyDCmJSXMYxQLNnwJypIA""",


@@ -33,70 +33,8 @@ def printascii():
___) | __/ (_| | | | (__| | | | __/ |
|____/ \___|\__,_|_| \___|_| |_|\___|_|
''' + Fore.RESET)
# 天欣安全实验室
def writeExcel(titles, links,ws):
infos = list(zip(titles, links))
for row in infos:
ws.append(row)
def create_sheet_and_write(wb, engine, keywords, num, title):
ws = wb.create_sheet(title=title)
result = engine(keywords, num)
writeExcel(result[0], result[1], ws)
def excel_text2url(link_url): # If the function body performs no async operations, marking it async would have no real effect on performance or behaviour.
'''Convert a URL string into an Excel HYPERLINK formula so the cell becomes a clickable link'''
return f'=HYPERLINK("{link_url}","{link_url}")'
# Walk every worksheet and pass each value in column 2 to excel_text2url, reassigning the cell
def update_hyperlinks(wb):
for sheet in wb.worksheets: # iterate over every worksheet
for row in sheet.iter_rows(min_row=1, max_row=sheet.max_row, min_col=2, max_col=2): # iterate over column 2
for cell in row:
if cell.value: # check whether the cell has content
cell.value = excel_text2url(cell.value) # convert the URL into a hyperlink formula
else:
break
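A self-contained sketch of what excel_text2url and update_hyperlinks do to a workbook (the helper is re-declared locally so the snippet runs on its own):

import openpyxl

def excel_text2url(link_url):
    '''Wrap a URL string in an Excel HYPERLINK formula so the cell becomes clickable.'''
    return f'=HYPERLINK("{link_url}","{link_url}")'

wb = openpyxl.Workbook()
ws = wb.active
ws.append(('Example title', 'https://example.com'))

# Rewrite every non-empty cell in column 2 as a HYPERLINK formula, as update_hyperlinks does.
for row in ws.iter_rows(min_row=1, max_row=ws.max_row, min_col=2, max_col=2):
    for cell in row:
        if cell.value:
            cell.value = excel_text2url(cell.value)

print(ws.cell(row=1, column=2).value)  # =HYPERLINK("https://example.com","https://example.com")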
def commend():
parser = argparse.ArgumentParser(prog="Searcher", description='Coroutine-based scraper for Baidu, Bing and Google search results -- 天欣安全实验室', usage='please read -h')
parser.add_argument("-k", type=str, help="Keywords to search for", nargs='+')
# All arguments below are optional flags; argparse reads their values as strings
parser.add_argument("-p", type=str, help="Number of pages to search, default 5; ranges are supported, e.g. 2:5 searches pages 2 through 5", default='5')
parser.add_argument("-m", type=str, help="Search engine(s) to use: Baidu: bd, Bing: bin, Google: goo; defaults to all", default='all',nargs='+')
# parser.add_argument("-t", '--task', type=int, help="Number of worker tasks, default 8", default=8)
parser.exit_on_error = False
args = parser.parse_args()
if len(sys.argv) == 1:
printascii()
parser.print_help()
sys.exit()
return args
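A small parsing sketch for the flags defined above (the parser is rebuilt locally with abbreviated help strings; the sample argv values are made up):

import argparse

parser = argparse.ArgumentParser(prog="Searcher")
parser.add_argument("-k", type=str, nargs='+', help="search keywords")
parser.add_argument("-p", type=str, default='5', help="pages, e.g. 5 or 2:5")
parser.add_argument("-m", type=str, nargs='+', default='all', help="engines: bd, bin, goo")

# e.g. searching pages 2..5 of Bing and Baidu for one keyword:
args = parser.parse_args(['-k', 'example company', '-p', '2:5', '-m', 'bin', 'bd'])
print(args.k, args.p, args.m)   # ['example company'] 2:5 ['bin', 'bd']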
def search_company_info(company_name_key, addon_args, num):
search_key = company_name_key.strip() + " " + addon_args
search_key = search_key.strip()
result = Bing.bing_main(search_key, num)
# loop over result[0] and result[1] with a for loop
return result
# for i in range(len(result[0])):
# title= result[0][i]
# url = result[1][i]
# print(f"必应搜索爬取结果为,title:{title}, url:{url}")
# if re.match(r"^https://aiqicha.baidu.com/company_detail_.*|https://www.qcc.com/firm/.*|https://www.tianyancha.com/company/.*", url):
# data_list.append({"title":title, "url":url})
# return data_list
def filter_company_sites(urls):
# urls e.g. https://www.tianyancha.com/company/5226478758
# url: https://aiqicha.baidu.com/company_detail_26602790857925
# url: https://www.qcc.com/firm/05b449eb5cc417d0f97c14104051f5c0.html
# Match the prefixes https://aiqicha.baidu.com/company_detail_*, https://www.qcc.com/firm/*.html and https://www.tianyancha.com/company/*
filtered_urls = [url for url in urls if re.match(r"^https://aiqicha.baidu.com/company_detail_.*|https://www.qcc.com/firm/.*|https://www.tianyancha.com/company/.*", url)]
return filtered_urls
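A quick self-contained check of the URL filter against the three documented prefixes (the function is re-declared here only so the snippet runs on its own; the sample list is made up):

import re

def filter_company_sites(urls):
    # Keep only Aiqicha company-detail, QCC firm, and Tianyancha company URLs.
    pattern = r"^https://aiqicha.baidu.com/company_detail_.*|https://www.qcc.com/firm/.*|https://www.tianyancha.com/company/.*"
    return [url for url in urls if re.match(pattern, url)]

sample = [
    "https://aiqicha.baidu.com/company_detail_26602790857925",
    "https://www.qcc.com/firm/05b449eb5cc417d0f97c14104051f5c0.html",
    "https://www.tianyancha.com/company/5226478758",
    "https://example.com/not-a-company-page",
]
print(filter_company_sites(sample))  # the last URL is filtered out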
def filter_aiqicha_qcc(search_result, company_name_, with_not_match = False):
datas = []
@@ -125,100 +63,6 @@ def filter_aiqicha_qcc(search_result, company_name_, with_not_match = False):
datas.append(data_node)
return datas
def search_one_company(company_name_arg, num):
keywords = company_name_arg
# for key in keyword:
# keywords = keywords + key + " "
keywords = keywords.strip()
print(f"---==您搜索的关键词为:{keywords}")
wb = openpyxl.Workbook()
# 删除默认创建的工作表(现在名为 "数据表1"
wb.remove(wb['Sheet'])
printascii()
pattern = r"[\\/:\*\?\"<>|]"
keyword = re.sub(pattern, "", keywords)
create_sheet_and_write(wb, Bing.bing_main, keywords, num, "必应爬取结果")
create_sheet_and_write(wb, Baidu.baidu_main, keywords, num, "百度爬取结果")
# 将所有url变为超链接,点击即可打开转跳
update_hyperlinks(wb)
wb.save(f'./{keyword}-{company_name_arg}.xlsx')
print(Fore.GREEN + '总任务结束!' + Fore.RESET)
end = time.time()
print(Fore.RED + f'脚本总时间: {end - start:.2f}')
def save_to_csv(filter_list):
"""
Append the results to a csv file
Args:
filter_list: list of records to write
"""
if filter_list is None or len(filter_list) == 0:
print('filter_list is None or empty, nothing to write')
return False
csv_file = 'company_search_result_data.csv'
headers = ['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
# Check whether the file exists; if not, it will be created and the header row written
file_exists = os.path.exists(csv_file)
# Load existing data so duplicates can be detected
existing_data = set()
if file_exists:
with open(csv_file, 'r', encoding='utf-8') as f:
reader_ins = csv.reader(f)
header_skipped = False
for row in reader_ins:
if not header_skipped:
header_skipped = True
continue
if len(row) >= 5: # make sure the row is complete
company_name = row[4] # company_name is in column 5 (index 4)
web_site_type = row[2] if len(row) > 2 else "" # web_site_type is in column 3 (index 2)
existing_data.add((company_name, web_site_type))
# Write the data
with open(csv_file, 'a', encoding='utf-8', newline='') as f:
writer = csv.writer(f)
# Write the header row if the file did not exist
if not file_exists:
writer.writerow(headers)
# Append rows, skipping duplicates
for data_node in filter_list:
company_name = data_node.get('company_name', '')
web_site_type = data_node.get('web_site_type', '')
# Skip rows whose (company_name, web_site_type) pair already exists
if (company_name, web_site_type) not in existing_data:
# Format the creation time
create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Build the data row
row_data = [
data_node.get('title', ''),
data_node.get('url', ''),
web_site_type,
data_node.get('request_url', ''),
company_name,
create_time
]
writer.writerow(row_data)
# Remember the pair so duplicates within this batch are also skipped
existing_data.add((company_name, web_site_type))
print(f"Row written successfully, title:{data_node.get('title', '')}, "
f"url:{data_node.get('url', '')}, "
f"web_site_type:{web_site_type}, "
f"request_url:{data_node.get('request_url', '')}, "
f"company_name:{company_name}, "
f"create_time:{create_time}")
def check_company_exists(company_names, type_list):
"""
@@ -379,38 +223,9 @@ if __name__ == '__main__':
pass
pass
if True:
print("exit")
exit(0)
i = 1
for company_name_ele in check_result:
company_name = company_name_ele["company_name"]
company_exists = company_name_ele["exists"]
# Skip companies that already exist
if company_exists:
print(f"Company {company_name} already exists, skipping")
continue
sleep_time = 5
sleep_time += random.randint(3, 10)
time.sleep(sleep_time)
addon_args = " 爱企查|企查查"
data_list = search_company_info(company_name, addon_args, '1')
filter_list = filter_aiqicha_qcc(data_list, company_name)
print("filter_list:",filter_list)
save_to_csv(filter_list)
if len(filter_list)<= 0:
print("没有数据 filter_list is empty. "+company_name)
continue
i=i+1
if i > 100:
break

crawler_bing_main.py.bak (new file)

@@ -0,0 +1,197 @@
def writeExcel(titles, links,ws):
infos = list(zip(titles, links))
for row in infos:
ws.append(row)
def create_sheet_and_write(wb, engine, keywords, num, title):
ws = wb.create_sheet(title=title)
result = engine(keywords, num)
writeExcel(result[0], result[1], ws)
def excel_text2url(link_url): # If the function body performs no async operations, marking it async would have no real effect on performance or behaviour.
'''Convert a URL string into an Excel HYPERLINK formula so the cell becomes a clickable link'''
return f'=HYPERLINK("{link_url}","{link_url}")'
# Walk every worksheet and pass each value in column 2 to excel_text2url, reassigning the cell
def update_hyperlinks(wb):
for sheet in wb.worksheets: # iterate over every worksheet
for row in sheet.iter_rows(min_row=1, max_row=sheet.max_row, min_col=2, max_col=2): # iterate over column 2
for cell in row:
if cell.value: # check whether the cell has content
cell.value = excel_text2url(cell.value) # convert the URL into a hyperlink formula
else:
break
def commend():
parser = argparse.ArgumentParser(prog="Searcher", description='Coroutine-based scraper for Baidu, Bing and Google search results -- 天欣安全实验室', usage='please read -h')
parser.add_argument("-k", type=str, help="Keywords to search for", nargs='+')
# All arguments below are optional flags; argparse reads their values as strings
parser.add_argument("-p", type=str, help="Number of pages to search, default 5; ranges are supported, e.g. 2:5 searches pages 2 through 5", default='5')
parser.add_argument("-m", type=str, help="Search engine(s) to use: Baidu: bd, Bing: bin, Google: goo; defaults to all", default='all',nargs='+')
# parser.add_argument("-t", '--task', type=int, help="Number of worker tasks, default 8", default=8)
parser.exit_on_error = False
args = parser.parse_args()
if len(sys.argv) == 1:
printascii()
parser.print_help()
sys.exit()
return args
# for i in range(len(result[0])):
# title= result[0][i]
# url = result[1][i]
# print(f"必应搜索爬取结果为,title:{title}, url:{url}")
# if re.match(r"^https://aiqicha.baidu.com/company_detail_.*|https://www.qcc.com/firm/.*|https://www.tianyancha.com/company/.*", url):
# data_list.append({"title":title, "url":url})
# return data_list
def filter_company_sites(urls):
# urls e.g. https://www.tianyancha.com/company/5226478758
# url: https://aiqicha.baidu.com/company_detail_26602790857925
# url: https://www.qcc.com/firm/05b449eb5cc417d0f97c14104051f5c0.html
# Match the prefixes https://aiqicha.baidu.com/company_detail_*, https://www.qcc.com/firm/*.html and https://www.tianyancha.com/company/*
filtered_urls = [url for url in urls if re.match(r"^https://aiqicha.baidu.com/company_detail_.*|https://www.qcc.com/firm/.*|https://www.tianyancha.com/company/.*", url)]
return filtered_urls
def save_to_csv(filter_list):
"""
Append the results to a csv file
Args:
filter_list: list of records to write
"""
if filter_list is None or len(filter_list) == 0:
print('filter_list is None or empty, nothing to write')
return False
csv_file = 'company_search_result_data.csv'
headers = ['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
# Check whether the file exists; if not, it will be created and the header row written
file_exists = os.path.exists(csv_file)
# Load existing data so duplicates can be detected
existing_data = set()
if file_exists:
with open(csv_file, 'r', encoding='utf-8') as f:
reader_ins = csv.reader(f)
header_skipped = False
for row in reader_ins:
if not header_skipped:
header_skipped = True
continue
if len(row) >= 5: # make sure the row is complete
company_name = row[4] # company_name is in column 5 (index 4)
web_site_type = row[2] if len(row) > 2 else "" # web_site_type is in column 3 (index 2)
existing_data.add((company_name, web_site_type))
# Write the data
with open(csv_file, 'a', encoding='utf-8', newline='') as f:
writer = csv.writer(f)
# Write the header row if the file did not exist
if not file_exists:
writer.writerow(headers)
# Append rows, skipping duplicates
for data_node in filter_list:
company_name = data_node.get('company_name', '')
web_site_type = data_node.get('web_site_type', '')
# Skip rows whose (company_name, web_site_type) pair already exists
if (company_name, web_site_type) not in existing_data:
# Format the creation time
create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Build the data row
row_data = [
data_node.get('title', ''),
data_node.get('url', ''),
web_site_type,
data_node.get('request_url', ''),
company_name,
create_time
]
writer.writerow(row_data)
# Remember the pair so duplicates within this batch are also skipped
existing_data.add((company_name, web_site_type))
print(f"Row written successfully, title:{data_node.get('title', '')}, "
f"url:{data_node.get('url', '')}, "
f"web_site_type:{web_site_type}, "
f"request_url:{data_node.get('request_url', '')}, "
f"company_name:{company_name}, "
f"create_time:{create_time}")
def search_company_info(company_name_key, addon_args, num):
search_key = company_name_key.strip() + " " + addon_args
search_key = search_key.strip()
result = Bing.bing_main(search_key, num)
# loop over result[0] and result[1] with a for loop
return result
def search_one_company(company_name_arg, num):
keywords = company_name_arg
# for key in keyword:
# keywords = keywords + key + " "
keywords = keywords.strip()
print(f"---==您搜索的关键词为:{keywords}")
wb = openpyxl.Workbook()
# 删除默认创建的工作表(现在名为 "数据表1"
wb.remove(wb['Sheet'])
printascii()
pattern = r"[\\/:\*\?\"<>|]"
keyword = re.sub(pattern, "", keywords)
create_sheet_and_write(wb, Bing.bing_main, keywords, num, "必应爬取结果")
create_sheet_and_write(wb, Baidu.baidu_main, keywords, num, "百度爬取结果")
# 将所有url变为超链接,点击即可打开转跳
update_hyperlinks(wb)
wb.save(f'./{keyword}-{company_name_arg}.xlsx')
print(Fore.GREEN + '总任务结束!' + Fore.RESET)
end = time.time()
print(Fore.RED + f'脚本总时间: {end - start:.2f}')
if __name__ == '__main__':
if True:
print("exit")
exit(0)
i = 1
for company_name_ele in check_result:
company_name = company_name_ele["company_name"]
company_exists = company_name_ele["exists"]
# Skip companies that already exist
if company_exists:
print(f"Company {company_name} already exists, skipping")
continue
sleep_time = 5
sleep_time += random.randint(3, 10)
time.sleep(sleep_time)
addon_args = " 爱企查|企查查"
data_list = search_company_info(company_name, addon_args, '1')
filter_list = filter_aiqicha_qcc(data_list, company_name)
print("filter_list:",filter_list)
save_to_csv(filter_list)
if len(filter_list)<= 0:
print("没有数据 filter_list is empty. "+company_name)
continue
i=i+1
if i > 100:
break


@@ -18,12 +18,57 @@ timeout = aiohttp.ClientTimeout(
sock_connect=5.5, # connection timeout: 5.5 s
sock_read=5.5 # read timeout: 5.5 s
)
# New Playwright-based Bing search function
async def getbing_with_click(keyword, page_num=1):
"""Perform a Bing search with Playwright and simulate clicks"""
from tool.web_browser import WebBrowser
import asyncio
data_list = []
browser = WebBrowser()
try:
# Launch the browser
await browser.start_browser_async()
# Open the Bing search page
search_url = f'https://cn.bing.com/search?q={keyword}&first={(page_num - 1) * 10 + 1}&count=10'
success = browser.visit_page(search_url)
if success:
# Wait for the page to finish loading
await asyncio.sleep(10) # wait 10 seconds
browser.input_and_enter('#sb_form_q', keyword)
# Wait for the page to finish loading
# or wait for a specific element to appear
# browser.page.wait_for_selector('h2 a', timeout=10000)
# Wait for the page to finish loading
await asyncio.sleep(20) # wait 20 seconds
# Extract the search results
data_list = browser.extract_links('h2 a')
except Exception as e:
print(f"Bing page crawl failed: {e}")
finally:
await browser.close_browser_async()
return data_list
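A hedged usage sketch for the Playwright path; it assumes Playwright and the project's tool.web_browser module are installed, and the keyword is a placeholder:

import asyncio
# Run the Playwright-backed search once for result page 1 of a keyword.
results = asyncio.run(getbing_with_click('example company', page_num=1))
print(len(results), 'links extracted')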
async def getbing(url, session):
# url_list = []
# title_list = []
data_list =[]
async with session.get(url, headers=bingheaders,timeout=timeout) as resp:
# print("正在爬取url:"+url)
# # 使用通用方法点击搜索按钮
# browser.interact_with_element('#sb_form_go', 'click')
try:
a = await resp.text()
soup = BeautifulSoup(a, 'lxml')
@@ -47,20 +92,54 @@ async def getbing(url, session):
#url_list, title_list
# Add a form-submission variant of the [getbing](file:///Users/liyaya/gitstudy/Spider/SearchCompany/search/Bing.py#L20-L46) function
async def getbingPost(keyword, session):
data_list = []
# Visit the homepage first to obtain the necessary cookies and parameters
async with session.get('https://cn.bing.com', headers=bingheaders, timeout=timeout) as resp:
homepage = await resp.text()
# Simulate a form-submitted search
search_data = {
'q': keyword,
'go': 'Submit',
'first': '1',
'count': '10'
}
search_url = 'https://cn.bing.com/search'
async with session.post(search_url, data=search_data, headers=bingheaders, timeout=timeout) as resp:
try:
a = await resp.text()
soup = BeautifulSoup(a, 'lxml')
h2a = soup.select('h2 a')
for h in h2a:
htext = h.text.replace('\n', '').replace(',', ' ').strip()
hurl = h.get('href')
if not hurl.startswith(('http://', 'https://')):
domain = 'https://cn.bing.com/'
hurl = urllib.parse.urljoin(domain, hurl)
print(htext, " ", hurl)
data_list.append({'title': htext, 'url': hurl, 'request_url': search_url})
except:
print(f"必应页面爬取失败,该url无法正常获取数据。")
return []
return data_list
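A usage sketch for the form-submission variant; getbingPost needs an open aiohttp session, so the small wrapper coroutine below (run_post_search, a name invented for this example) creates one:

import asyncio
import aiohttp

async def run_post_search(keyword):
    # getbingPost expects a live ClientSession; create one per search.
    async with aiohttp.ClientSession() as session:
        return await getbingPost(keyword, session)

rows = asyncio.run(run_post_search('example company'))
print(f'{len(rows)} results via POST')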
async def bing_spinder(keyword, num):
print(f'Bing crawl task running, pages to crawl: {num}...')
print('Title url')
# urllist = []
# titlelist = []
data_list = []
tasks = []
if ':' in num:
if num.count(':') > 1:
raise ValueError("The input must contain exactly one ':'")
else:
# Split the string and make sure both parts are numeric
start_page, end_page = num.split(':')
# Check that both sides are digits
if not (start_page.isdigit() and end_page.isdigit()):
raise ValueError("The values on both sides of ':' must be numeric")
else:
@@ -68,23 +147,77 @@ async def bing_spinder(keyword, num):
end_page = (int(end_page)) * 10
else:
start_page, end_page = 0, int(num) * 10
async with aiohttp.ClientSession() as session:
for pn in range(start_page, end_page, 10):
#url = f'https://cn.bing.com/search?q={keyword}&first={pn}&mkt=zh-CN'
# Fix: use the correct pagination parameters
url = f'https://cn.bing.com/search?q={keyword}&first={pn + 1}&count=10&FORM=PERE'
print("Crawling url: " + url)
tasks = tasks + [asyncio.create_task(getbing(url, session))]
tasks = tasks + [asyncio.create_task(getbing_with_click(keyword))]  # getbing_with_click takes (keyword, page_num), so no session is passed
# for pn in range(start_page, end_page, 10):
#
# tasks = tasks + [asyncio.create_task(getbing_with_click(keyword, session))]
# Pass the keyword directly instead of building a URL
# tasks = tasks + [asyncio.create_task(getbing(keyword, session))]
result = await asyncio.gather(*tasks)
for i in range(int((end_page-start_page) / 10)):
# urllist += result[i][0]
# titlelist += result[i][1]
data_list += result[i]
for res in result:
data_list += res
count = len(data_list)
print(f"必应搜索爬取结果为{count}")
print(Fore.GREEN + '必应爬取任务完成\n' + Fore.RESET)
return data_list
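A usage sketch showing the page-range syntax bing_spinder accepts: '2:5' crawls pages 2 through 5, while a plain '3' crawls pages 1 through 3; the keyword is a placeholder:

import asyncio
# Crawl Bing result pages 2..5 for one keyword; returns a list of {'title', 'url', 'request_url'} dicts.
data = asyncio.run(bing_spinder('example company', '2:5'))
print(len(data), 'results collected')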
# await bingwriteCSV(titlelist, urllist, keyword)
# async def bing_spinder(keyword, num):
# print(f'Bing crawl task running, pages to crawl: {num}...')
# print('Title url')
# # urllist = []
# # titlelist = []
# data_list =[]
# tasks = []
# if ':' in num:
# if num.count(':') > 1:
# raise ValueError("The input must contain exactly one ':'")
# else:
# # Split the string and make sure both parts are numeric
# start_page, end_page = num.split(':')
# # Check that both sides are digits
# if not (start_page.isdigit() and end_page.isdigit()):
# raise ValueError("The values on both sides of ':' must be numeric")
# else:
# start_page = (int(start_page)-1)*10
# end_page = (int(end_page))*10
# else:
# start_page, end_page =0,int(num) * 10
# async with aiohttp.ClientSession() as session:
#
# for pn in range(start_page, end_page, 10):
# url = f'https://cn.bing.com/search?q={keyword}&first={pn + 1}&count=10'
# print("Crawling url: " + url)
# tasks = tasks + [asyncio.create_task(getbing(url, session))]
# result = await asyncio.gather(*tasks)
# #
# # url = f'https://cn.bing.com/search?q={keyword}&go=Submit&first={pn + 1}&count=10'
# # # url = f'https://cn.bing.com/search?q={keyword}' # &first={pn + 1}' #&count=10&FORM=PERE'
# # print("Crawling url: " + url)
# # tasks = tasks + [asyncio.create_task(getbing(url, session))]
# # # for pn in range(start_page, end_page, 10):
# # # #url = f'https://cn.bing.com/search?q={keyword}&first={pn}&mkt=zh-CN'
# # # # Fix: use the correct pagination parameters
# # # url = f'https://cn.bing.com/search?q={keyword}' #&first={pn + 1}' #&count=10&FORM=PERE'
# # # print("Crawling url: " + url)
# # # tasks = tasks + [asyncio.create_task(getbing(url, session))]
# # result = await asyncio.gather(*tasks)
# for i in range(int((end_page-start_page) / 10)):
# # urllist += result[i][0]
# # titlelist += result[i][1]
# data_list += result[i]
# count=len(data_list)
# print(f"Bing search collected {count} results")
# print(Fore.GREEN + 'Bing crawl task complete\n' + Fore.RESET)
# return data_list
# # await bingwriteCSV(titlelist, urllist, keyword)
def bing_main(keyword,num):


@@ -42,6 +42,155 @@ class WebBrowser:
except:
pass
def click_element(self, selector):
"""
模拟点击页面元素
Args:
selector (str): CSS选择器或XPath表达式
Returns:
bool: 点击成功返回True否则返回False
"""
try:
# 等待元素出现
self.page.wait_for_selector(selector, timeout=10000)
# 查找元素
element = self.page.query_selector(selector)
if not element:
print(f"未找到元素: {selector}")
return False
# 模拟鼠标移动到元素
element.hover()
time.sleep(random.uniform(0.5, 1.0))
# 点击元素
element.click()
# 模拟人类点击后的等待
time.sleep(random.uniform(1, 2))
return True
except Exception as e:
print(f"点击元素失败: {selector}, 错误: {str(e)}")
return False
def input_and_enter(self, selector, text):
"""
在指定输入框输入文本并按回车键
Args:
selector (str): 输入框的CSS选择器
text (str): 要输入的文本
Returns:
bool: 输入成功返回True否则返回False
"""
try:
# 等待输入框出现
self.page.wait_for_selector(selector, timeout=10000)
# 查找输入框元素
input_element = self.page.query_selector(selector)
if not input_element:
print(f"未找到输入框: {selector}")
return False
# 点击输入框以获得焦点
input_element.click()
time.sleep(random.uniform(0.5, 1.0))
# 清空现有内容并输入新文本
input_element.fill(text)
# 模拟输入间隔
time.sleep(random.uniform(1, 2))
# 按回车键
input_element.press('Enter')
# 等待页面响应
time.sleep(random.uniform(2, 3))
return True
except Exception as e:
print(f"输入并回车失败: {selector}, 错误: {str(e)}")
return False
def interact_with_element(self, selector, action_type="click", text=None, callback=None):
"""
通用元素交互方法,支持多种操作类型和回调
Args:
selector (str): 元素的CSS选择器
action_type (str): 操作类型 ("click", "input_enter", "hover")
text (str): 输入文本仅在action_type为"input_enter"时需要)
callback (function): 回调函数,在操作完成后执行
Returns:
bool: 操作成功返回True否则返回False
"""
try:
# 等待元素出现
self.page.wait_for_selector(selector, timeout=10000)
element = self.page.query_selector(selector)
if not element:
print(f"未找到元素: {selector}")
return False
result = False
if action_type == "click":
# 模拟点击
element.hover()
time.sleep(random.uniform(0.5, 1.0))
element.click()
result = True
elif action_type == "input_enter":
if text is None:
print("输入操作需要提供text参数")
return False
# 模拟输入并回车
element.click()
time.sleep(random.uniform(0.5, 1.0))
element.fill(text)
time.sleep(random.uniform(1, 2))
element.press('Enter')
result = True
elif action_type == "hover":
# 模拟悬停
element.hover()
time.sleep(random.uniform(1, 2))
result = True
# 模拟人类行为延迟
time.sleep(random.uniform(1, 2))
# 执行回调函数
if callback and callable(callback):
try:
callback(result, selector)
except Exception as e:
print(f"回调函数执行失败: {e}")
return result
except Exception as e:
print(f"元素交互失败: {selector}, 错误: {str(e)}")
if callback and callable(callback):
try:
callback(False, selector)
except Exception as cb_e:
print(f"回调函数执行失败: {cb_e}")
return False
def get_random_user_agent(self):
"""获取随机User-Agent"""
user_agents = [
@@ -345,7 +494,7 @@ class WebBrowser:
# self.page.wait_for_load_state("networkidle")
# 3. Wait for the page load state rather than a specific element
try:
self.page.wait_for_load_state('networkidle', timeout=5000)
self.page.wait_for_load_state('networkidle', timeout=15000)
print("networkidle: page loaded")
except Exception as e:
print(f"Error while waiting for the page load state: {e}")
@@ -394,3 +543,30 @@ class WebBrowser:
print(f"提取链接失败: {e}")
return links
#
# # Simulate clicking the search button
# browser.click_element('#sb_form_go')
#
# # Type into the search box and press Enter
# browser.input_and_enter('#sb_form_q', 'search keyword')
#
# # Click the search button via the generic helper
# browser.interact_with_element('#sb_form_go', 'click')
#
# # Type and press Enter via the generic helper
# browser.interact_with_element('#sb_form_q', 'input_enter', 'search keyword')
#
# # Interaction with a callback
# def search_callback(success, selector):
# if success:
# print(f"Element action succeeded: {selector}")
# else:
# print(f"Element action failed: {selector}")
#
# browser.interact_with_element(
# '#sb_form_q',
# 'input_enter',
# 'search keyword',
# search_callback
# )