Our recent group assignment was to build a scraper and an analysis model for an air-quality website; I was responsible for the scraper part.
The site under analysis: 绿色呼吸空气监测网 (the "Green Breath" air-quality monitoring network, pm25.com).
Scraping the full city list, the city URLs, and each city's current overall and per-station readings all went smoothly, but I got stuck on extracting the 24-hour and 30-day PM2.5 trends.
Take one concrete city for analysis: the Beijing real-time monitoring page (北京城市空气实时监测网).
Right-clicking the real-time chart and viewing the page source shows that it is a dynamically loaded module (click a different point and it changes...).
So I searched the source for the refreshingly literal legend string "美国标准" (US standard), which pinpoints the data inside the last script tag on the page.
The source reads as follows:
<script>
seajs.use(['common', 'tabs', 'echarts'], function(common, Tabs){
    $(document).ready(function(){
        common.listHover();
        new Tabs({
            element: '.pj_area',
            triggers: '.pj_area_tab a',
            triggerType: 'click',
            activeTriggerClass: 'current',
            panels: '.pj_area_data_details',
            activeIndex: 0
        });
        new Tabs({
            element: '.panel_graph',
            triggers: '.pg_area_tab a',
            triggerType: 'click',
            activeTriggerClass: 'current',
            panels: '.pg_content',
            activeIndex: 0
        });
        // 基于准备好的dom,初始化echarts图表
        var myChart = echarts.init(document.getElementById('pg_content_24h'), 'shine');
        var option = {
            color: ['#79b05f', '#e58c65'],
            tooltip : {
                trigger: 'axis'
            },
            legend: {
                data:['美国标准','中国标准']
            },
            xAxis : [
                {
                    type : 'category',
                    boundaryGap : false,
                    data : ["27\u65e519\u65f6","27\u65e520\u65f6","27\u65e521\u65f6","27\u65e522\u65f6","27\u65e523\u65f6","28\u65e500\u65f6","28\u65e501\u65f6","28\u65e502\u65f6","28\u65e503\u65f6","28\u65e504\u65f6","28\u65e505\u65f6","28\u65e506\u65f6","28\u65e507\u65f6","28\u65e508\u65f6","28\u65e509\u65f6","28\u65e510\u65f6","28\u65e511\u65f6","28\u65e512\u65f6","28\u65e513\u65f6","28\u65e514\u65f6","28\u65e515\u65f6","28\u65e516\u65f6","28\u65e517\u65f6","28\u65e518\u65f6"],
                    axisLine: {
                        lineStyle : {
                            color: '#d4d4d4'
                        }
                    }
                }
            ],
            yAxis : [
                {
                    type : 'value',
                    axisLabel : {
                        formatter: '{value} '
                    },
                    axisLine: {
                        lineStyle : {
                            color: '#d4d4d4'
                        }
                    }
                }
            ],
            series : [
                {
                    name:'美国标准',
                    type:'line',
                    data:[133,144,154,160,162,160,157,154,148,217,352,308,422,393,231,154,104,77,61,53,48,48,43,44] },
                {
                    name:'中国标准',
                    type:'line',
                    data:[77,86,99,110,114,111,105,103,110,214,358,312,427,399,231,155,104,77,61,53,48,48,43,42] }
            ]
        };
        // 为echarts对象加载数据
        myChart.setOption(option);
        // 基于准备好的dom,初始化echarts图表
        var myChart = echarts.init(document.getElementById('pg_content_30d'), 'shine');
        var option = {
            color: ['#79b05f', '#e58c65'],
            tooltip : {
                trigger: 'axis'
            },
            legend: {
                data:['美国标准','中国标准']
            },
            xAxis : [
                {
                    type : 'category',
                    boundaryGap : false,
                    data : ["16\u65e5","17\u65e5","18\u65e5","19\u65e5","20\u65e5","21\u65e5","22\u65e5","23\u65e5","24\u65e5","25\u65e5","26\u65e5","27\u65e5","28\u65e5","29\u65e5","12\u65e5","13\u65e5","14\u65e5","15\u65e5","16\u65e5","17\u65e5","18\u65e5","19\u65e5","20\u65e5","21\u65e5","22\u65e5","23\u65e5","24\u65e5","25\u65e5","26\u65e5","27\u65e5"],
                    axisLine: {
                        lineStyle : {
                            color: '#d4d4d4'
                        }
                    }
                }
            ],
            yAxis : [
                {
                    type : 'value',
                    axisLabel : {
                        formatter: '{value} '
                    },
                    axisLine: {
                        lineStyle : {
                            color: '#d4d4d4'
                        }
                    }
                }
            ],
            series : [
                {
                    name:'美国标准',
                    type:'line',
                    data:[56,92,34,72,97,134,127,72,47,78,103,90,130,152,90,68,32,53,75,123,117,158,104,52,105,152,111,29,78,123] },
                {
                    name:'中国标准',
                    type:'line',
                    data:[45,60,30,58,63,92,78,52,39,56,66,57,81,97,67,49,25,34,46,72,69,108,100,47,77,103,91,23,49,73] }
            ]
        };
        // 为echarts对象加载数据
        myChart.setOption(option);
    });
});
</script>
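(A side note on the axis labels: the \u65e5 and \u65f6 sequences are just unicode-escaped Chinese, 日 "day" and 时 "hour". A two-line check in Python:)

print("27\u65e519\u65f6")  # -> 27日19时, i.e. day 27, 19:00
print("16\u65e5")          # -> 16日, i.e. day 16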
I eagerly went at it with CSS and XPath extraction, and both printed nothing but heartless empty lists.
And so began a long stretch of Baidu searching.
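For context, here is a reconstruction of the kind of query that comes back empty (the exact selectors are my guess, not recorded in the original post); it can be tried in scrapy shell http://www.pm25.com/city/beijing.html. The chart container such as pg_content_24h ships as an empty div and is only filled by ECharts in the browser, so there is no text for a selector to find:

response.css('#pg_content_24h ::text').getall()                 # -> []
response.xpath('//div[@id="pg_content_24h"]//text()').getall()  # -> []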
Method 1: using BeautifulSoup:
import requests
from bs4 import BeautifulSoup

url = "http://www.pm25.com/city/beijing.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# grab every <script> tag; the chart data sits in the last one
temps = soup.find_all("script")
for i, temp in enumerate(temps):
    print(i)
    print(temp)
The result (the screenshot is not reproduced here): the script tags, chart data included, all print out.
It works, but everything before this point used Scrapy, and a teammate pointed out that bolting on a separate requests pass like this would be a bit slow.
So: PASS.
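Had we stuck with BeautifulSoup, one small refinement (my sketch, continuing from the snippet above, not from the original post) would be to keep only the script that actually carries the chart data instead of printing all of them:

# keep only the <script> whose body contains the legend string 美国标准
chart_scripts = [s for s in soup.find_all("script")
                 if s.string and "美国标准" in s.string]
print(chart_scripts[0] if chart_scripts else "not found")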
Method 2: selenium + webdriver:
import scrapy
from selenium import webdriver

class test_ajaxSpider(scrapy.Spider):
    name = "aajax"
    start_urls = ["http://www.pm25.com/city/beijing.html"]

    def parse(self, response):
        print("**********************")
        # launch a headless Chrome, load the same page, and dump the rendered source
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get("http://www.pm25.com/city/beijing.html")
        data = driver.page_source
        print(data)
        driver.quit()
The flow: Python opens a headless Chrome, visits start_urls, grabs the rendered page source, and closes the browser. It successfully prints the script data, but it is... even... slower...
(To be fair, slow as this method is, the experience is fantastic — watching the browser drive itself made me feel very capable.)
Method 3: extract from response.text.
After CSS extraction, XPath extraction, select, find_all, and assorted other methods had all failed, a suspicion formed: maybe these selectors were filtering out the data in this script at the initial selection step (...), so the tag could be found but its contents always came back empty. (Never having studied web design, I had naively assumed that anything inside a tag that was not itself a tag counted as text().)
So, within the Scrapy framework, I printed the entire page with response.text and extracted the complete data with regular expressions. Success at last.
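The core of the trick fits in a few lines. A minimal sketch, runnable in scrapy shell http://www.pm25.com/city/beijing.html (it assumes the page still looks like the script quoted above, where only the series values are written as data:[...] with no space before the bracket):

import re
# the four series (24h US, 24h China, 30-day US, 30-day China) are the only
# places where 'data:' is immediately followed by a purely numeric array
arrays = re.findall(r"data:\[[\d,]+\]", response.text)
print(arrays)  # four strings like 'data:[133,144,...]'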
The complete code:
# -*- coding: utf-8 -*-
import scrapy
import re
import time


class stan_24h():
    # simple container for the extracted PM2.5 series
    def __init__(self):
        self.COUN = [{}]


def pmchart(i, all_code):
    # Each chart's series values sit between 'series :' and the page's own
    # comment ' // 为echarts对象加载数据' (kept in Chinese: it must match the page).
    if i == 0:
        # rfind -> the LAST series block, i.e. the 30-day chart
        startnum = all_code.rfind('series :')
        endnum = all_code.rfind(' // 为echarts对象加载数据')
        return re.findall(r"data:[^\s]*", all_code[startnum:endnum])
    elif i == 1:
        # find -> the FIRST series block, i.e. the 24-hour chart
        startnum = all_code.find('series :')
        endnum = all_code.find(' // 为echarts对象加载数据')
        return re.findall(r"data:[^\s]*", all_code[startnum:endnum])
    else:
        return False


def addpm242self(numlist, mon, day, hour):
    # label each hourly value as 'month-day-hour', rolling over at midnight
    coun = []
    for num in numlist:
        if hour == 24:
            hour = 0
            day = day + 1
        coun.append({str(mon) + '-' + str(day) + '-' + str(hour): num})
        hour = hour + 1
    return coun


def addpm302self(numlist, year, mon, day):
    # build 30 day-of-month labels: the newest stretch counts back from today's
    # date, the rest from the 29th (year and mon are accepted but unused here)
    i = 29
    n = []
    x = day
    y = 29
    while i >= 0:
        if i >= 12:
            n.append(x)
            x = x - 1
        else:
            n.append(y)
            y = y - 1
        i = i - 1
    coun = []
    i = 0  # reset before pairing labels with values (the original started at -1)
    for num in numlist:
        coun.append({n[i]: num})
        i = i + 1
    return coun


class test_ajaxSpider(scrapy.Spider):
    name = "aajax"
    start_urls = ["http://www.pm25.com/city/beijing.html"]

    def parse(self, response):
        data = stan_24h()
        print("**********************")
        all_code = response.text
        # extract the raw 'data:[...]' strings for the 24-hour and 30-day charts
        pm24_data = pmchart(0, all_code)  # 30-day series (last block, despite the name)
        pm30_data = pmchart(1, all_code)  # 24-hour series (first block)
        year = int(time.strftime('%Y'))
        mon = int(time.strftime('%m'))
        day = int(time.strftime('%d'))
        hour = int(time.strftime('%H')) + 1
        temp_c = 0
        # regex out the per-reading numbers and build the labelled result lists
        while temp_c <= 1:
            # note the crossed names: pm24_data holds the 30-day block, so its
            # numbers feed the 30-day list, and vice versa
            pm30temp = re.findall(r"\d+\.?\d*", pm24_data[temp_c])
            pm24temp = re.findall(r"\d+\.?\d*", pm30_data[temp_c])
            if temp_c == 0:
                data.COUN[0] = addpm242self(pm24temp, mon, day - 1, hour)
                # original passed (mon, day, hour); fixed to match the signature
                data.COUN.append(addpm302self(pm30temp, year, mon, day))
            else:
                data.COUN.append(addpm242self(pm24temp, mon, day - 1, hour))
                data.COUN.append(addpm302self(pm30temp, year, mon, day))
            temp_c = temp_c + 1
        print('24h US standard:', data.COUN[0], '\n24h China standard:', data.COUN[2])
        print('30-day US standard:', data.COUN[1], '\n30-day China standard:', data.COUN[3])
Run result (screenshot omitted): the four labelled lists print as expected.
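One possible refinement, not in the original post: the second re.findall pass that digs the digits back out of each data:[...] string can be replaced by json.loads, since everything after the data: prefix is already a valid JSON array of integers:

import json
import re

def series_values(all_code):
    # return the four numeric series as lists of ints
    return [json.loads(m[len("data:"):])
            for m in re.findall(r"data:\[[\d,]+\]", all_code)]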
Hope this helps any fellow travellers who run into the same problem~
Author: 83748wuw
Source: python黑洞网 — https://www.pythonheidong.com/blog/article/147592/67f0dc00a08760291750/