爬虫练习2
##spider_MCENet.py
##目的:爬取http://bioinformatics.cau.edu.cn/MCENet/search_result.php?gene=GRMZM2G021617%0D%0A&query=Zm_Oth_Ara
##为了获取相关的同源基因
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#依赖 模块讲解 requests是请求网页 random随机函数 csv csv数据读写 re 正则表达式 time 定时操作 BeautifulSoup 解析html的包
import requests,random
import csv,re
import time
from bs4 import BeautifulSoup
#https://www.ncbi.nlm.nih.gov/gene/?term=Zm00001d036521
#payload={'term':'Zm00001d036521'}
#genename='Zm00001d036521'
#反反爬虫部署,添加headers,random访问,增加代理,使用代理访问。
user_agents=['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11']
gene_name=["origin_V3"]
Atr_name=["ATR"]
annotation=["description"]
p_value=["0.5"]
# 判断内容是否存在
#参数htmlcontent 抓的html , content_selector 是要判断是否存在的字符串
#用法示例 judge=chargecontent(html,"not available")
def chargecontent(htmlcontent, content_selector):
soup = BeautifulSoup(htmlcontent, "html.parser")
# 去除script
s=soup.get_text("/",strip=True)
hascontent = False
li=re.findall(content_selector,s)
hascontent = len(li) > 0
return hascontent
##定义主爬取函数,需要传入参数为genename V3版本
def getGid(genename):
#files={'file':open('deg.csv','rb')}
#payload={'gene':'GRMZM2G147279','query':'Zm_Oth_Ara'}
payload={'gene':genename,'query':'Zm_Oth_Ara'}
headers={'User-Agent':random.choice(user_agents)}
proxies={'http':'74.59.132.126:49073','https':'74.59.132.126:49073'}
url="http://bioinformatics.cau.edu.cn/MCENet/search_result.php"
#req=requests.get(url,headers=headers,params=payload,proxies=proxies)
req=requests.get(url,headers=headers,params=payload)
html=req.text
##预先判断是否存在页面是否被返回正常值,否,则不解析文本
judge=chargecontent(html,"not available")
if judge:
out_data=[genename,"","",""]
else:
bf=BeautifulSoup(html,"html5lib")
thread=bf.find('tbody')
gene1=thread.select_one('tr >td:nth-of-type(1)').text
gene2=thread.select_one('tr >td:nth-of-type(2)').get_text()
gene3=thread.select_one('tr >td:nth-of-type(3)').text
gene4=thread.select_one('tr >td:nth-of-type(4)').text
# # gene_name.append(gene1)
# # Atr_name.append(gene2)
# # annotation.append(gene3)
# # p_value.append(gene4)
# tr=trx.find_all('td')
# a_bf=BeautifulSoup(str(thread[0]))
# a=a_bf.find_all('span')
# genename=thread.find_all('td',limit=3)
#此处的gid就是我们要的基因的GID值
#gid = a[0].text.replace('\xa0'*8,'\n\n')
out_data=[gene1,gene2,gene3,gene4]
return out_data
genecount=csv.reader(open('spider.csv','r'))
gene_table=['gene_name','Atr_name','annotation','p_value']
for geneid in genecount:
print(geneid[0])
getdata=getGid(geneid[0])
gene_table.append(getdata)
time.sleep(random.random()) #暂停[0,1)秒
#gene_table=[gene_name,Atr_name,annotation,p_value]
headers=['V3','Atr','description','pvalue']
with open('out_spider.csv','w',newline='') as f:
writer=csv.writer(f)
writer.writerow(headers)
#for row in gene_table:
# writer.writerow(row)
writer.writerows(gene_table)
主要练习目标: 数据的获取、反反爬虫部署、数据识别、数据解析、数据读取输出 深层目标: 优化反反爬虫部署、增加判断浏览器返回值状态码,如果超时或者被反爬虫发现,需要给出反馈。 重点:python代码一定不要有多余的空格或者缩进,行尾要特别注意不要多空格,行头缩进全部使用tab,不能使用空格。