给大家分享一个早前爬取东方财富网股票信息的爬虫程序,回头来看做了好多改进,特别是数据处理部分使用了heapq模块,方便快捷一步到位...
# _*_ coding:utf-8 _*_
"""Crawl stock financial indicators from eastmoney.com and rank them.

Running the module prints the ten stocks with the highest net margin.
"""

import heapq
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Position of the net-margin column inside a record's 'data' list, which is
# laid out as: [name, total market cap, net assets, net profit, P/E, P/B,
# gross margin, net margin, ROE (return on equity)].
_NET_MARGIN_INDEX = 7


class GPINFO(object):
    """Collect per-stock financial indicators, cached in a per-day file.

    On construction the instance loads today's record file when it exists;
    otherwise it crawls the stock list page and each matching detail page.

    Attributes:
        Url: address of the stock list page.
        BaseData: list of record dicts with the keys 'num', 'name',
            'detail_url' and 'data'.
        Date: today's date formatted as YYYYMMDD.
        Record: path of today's record file ('basedata' + Date), holding
            one JSON object per line.
    """

    def __init__(self):
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        self.BaseData = []
        self.Date = time.strftime('%Y%m%d')
        self.Record = 'basedata'+self.Date
        if os.path.exists(self.Record):
            print ('record exist...')
            self.BaseData = self.get_base_data_from_record()
        else:
            print ('fuck-get data again...')
            self.get_data()

    def write_record(self, text, path=None):
        """Append ``text`` plus a newline, UTF-8 encoded, to a record file.

        Args:
            text: the line to append (a JSON document in this module).
            path: file to append to; defaults to ``self.Record``.
        """
        target = self.Record if path is None else path
        with open(target, 'ab') as f:
            f.write((text+'\n').encode('utf-8'))

    def get_base_data_from_record(self):
        """Return the records stored in ``self.Record`` as a list of dicts.

        Blank lines are ignored and lines that are not valid JSON are
        reported and skipped, so one damaged line does not discard the
        rest of the cache.
        """
        records = []
        with open(self.Record, 'rb') as f:
            for raw in f:
                line = raw.decode('utf-8').strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except ValueError as e:
                    print ('skip bad record line:', e)
        return records

    def get_data(self):
        """Crawl the stock list and detail pages, filling ``self.BaseData``.

        Records are appended to ``self.Record + '.part'`` while the crawl
        runs and the file is renamed to ``self.Record`` only once the loop
        has finished. An interrupted crawl therefore never leaves behind a
        partial file that the next run would mistake for a complete cache.

        Raises:
            requests.RequestException: if the stock list page itself cannot
                be fetched. Failures on individual detail pages are printed
                and skipped.
        """
        partial = self.Record + '.part'
        # Leftover from an earlier interrupted run: start from scratch.
        if os.path.exists(partial):
            os.remove(partial)

        # Fetch and parse the stock list page.
        orihtml = requests.get(self.Url, timeout=30).content
        soup = BeautifulSoup(orihtml, 'lxml')
        links = soup.find('div', class_='quotebody').find_all('a', {'target': '_blank'})

        for a in links:
            # Link text has the form "name(code)"; skip anything else.
            parts = a.get_text().split('(')
            if len(parts) < 2:
                continue
            name = parts[0]
            num = parts[1].strip(')')
            # Only codes starting with '00' or '60' are wanted.
            if not num.startswith(('00', '60')):
                continue
            detail_url = a.get('href')
            if not detail_url:
                continue

            try:
                cwzbhtml = requests.get(detail_url, timeout=30).content
            except requests.RequestException as e:
                print ('perhaps timeout:', e)
                continue
            cwzbsoup = BeautifulSoup(cwzbhtml, 'lxml')

            # First row of the financial-indicator table: [name, total
            # market cap, net assets, net profit, P/E, P/B, gross margin,
            # net margin, ROE].
            try:
                cwzb_list = cwzbsoup.find('div', class_='cwzb').tbody.tr.get_text().split()
            except AttributeError as e:
                # The page has no indicator table in the expected place.
                print ('error:', e)
                continue

            # A bare '-' cell marks a delisted stock (per the original
            # author's note); such rows are dropped.
            if '-' in cwzb_list:
                continue

            record_d = {
                'num': num,
                'name': name,
                'detail_url': detail_url,
                'data': cwzb_list,
            }
            self.BaseData.append(record_d)
            self.write_record(json.dumps(record_d), partial)
            print (len(self.BaseData))

        # Publish the cache only after a complete pass over the list.
        if os.path.exists(partial):
            os.replace(partial, self.Record)


def _net_margin(record):
    """Return the record's net margin as a float, or None if unusable."""
    try:
        return float(record['data'][_NET_MARGIN_INDEX].strip('%'))
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        return None


def main():
    """Print the indicator rows of the ten stocks with the highest net margin."""
    test = GPINFO()
    # Rows whose net-margin cell is missing or non-numeric cannot be ranked.
    rankable = [r for r in test.BaseData if _net_margin(r) is not None]
    top_10 = heapq.nlargest(10, rankable, key=_net_margin)
    for i in top_10:
        print(i['data'])


if __name__ == '__main__':
    main()
程序主函数部分是为了获取净利率前10名的股票信息,打印结果如下:
['绵石投资', '52.2亿', '14.0亿', '1.25亿', '30.90', '3.73', '42.25%', '2047.04%', '9.27%']
['国投安信', '556亿', '270亿', '21.1亿', '19.80', '2.12', '5.90%', '487.53%', '7.79%']
['川投能源', '379亿', '202亿', '28.0亿', '10.16', '1.91', '37.01%', '402.64%', '14.58%']
['ST明科', '47.6亿', '9.25亿', '5.11千万', '68.00', '5.14', '2.38%', '345.11%', '5.68%']
['华联控股', '93.6亿', '31.5亿', '4.76亿', '14.54', '3.74', '46.25%', '328.53%', '20.88%']
['上海九百', '68.2亿', '12.3亿', '1.61亿', '31.67', '5.56', '54.00%', '297.99%', '13.21%']
['凯瑞德', '46.7亿', '1.14亿', '3.27千万', '107.10', '40.94', '16.07%', '294.19%', '33.41%']
['鲁信创投', '172亿', '38.6亿', '3.32亿', '38.48', '4.64', '28.67%', '244.43%', '9.26%']
['博闻科技', '35.0亿', '6.56亿', '2.23千万', '117.65', '5.36', '-16.07%', '215.27%', '3.41%']
['万泽股份', '71.8亿', '13.7亿', '6.87千万', '78.38', '5.29', '22.57%', '203.15%', '5.13%']