发布于2019-08-05 18:21 阅读(1106) 评论(0) 点赞(1) 收藏(4)
import re
import requests
from lxml.html import etree
# XPath expressions for the listing detail link and its title attribute.
url_xpath = '//dd/p[1]/a[1]/@href'
title_xpath = '//dd/p[1]/a[1]/@title'
# NOTE(review): the name is misspelled ("xpaht") and the value is never used
# by this script; kept only so the module-level name does not disappear.
data_xpaht = '//dd/p[2]/text()'
headers = {
    # The key used to be the garbled 'rpferpr', which is not an HTTP header;
    # 'Referer' is the standard header name.
    'Referer': 'https://sh.zu.fang.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
}

LISTING_URL = 'https://sh.zu.fang.com/'
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Raw strings keep regex escapes such as \s and \S from being treated as
# (invalid) Python string escapes.
_SUMMARY_RE = re.compile(r'<p class="font15 mt12 bold">(.*?)</p>', re.S)
_MOLD_RE = re.compile(r'\r\n\s.*?(\S.*?)<span class="splitline">')
_FIELD_RE = re.compile(r'</span>(.*?)<span class="splitline">')


def parse_listing(fragment):
    """Extract (rental type, house type, area) from one summary fragment.

    ``fragment`` is the inner HTML of a ``<p class="font15 mt12 bold">``
    element. Returns a 3-tuple of strings, or ``None`` when the fragment
    does not contain all three fields, so the caller can drop the whole
    record instead of keeping a partial one.
    """
    # '\ufffdO' is what the original script replaced; presumably it is the
    # square-metre sign after a lossy decode of the page -- TODO confirm.
    # A correctly decoded sign is normalised to the same text.
    text = fragment.replace('\ufffdO', '平方米').replace('㎡', '平方米')
    mold = _MOLD_RE.search(text)
    fields = _FIELD_RE.findall(text)
    if mold is None or len(fields) < 2:
        return None
    return mold.group(1), fields[0], fields[1]


def main():
    """Fetch the listing page and append one tuple per listing to info.txt."""
    rp = requests.get(LISTING_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error page rather than silently parsing it.
    rp.raise_for_status()
    rp.encoding = rp.apparent_encoding
    html = etree.HTML(rp.text)
    urls = html.xpath(url_xpath)
    titles = html.xpath(title_xpath)
    summaries = _SUMMARY_RE.findall(rp.text)

    with open('info.txt', 'a', encoding='utf8') as fa:
        # Zip BEFORE filtering: pairing by position first means a listing
        # that fails to parse is skipped as a whole and cannot shift the
        # remaining rows against their titles and URLs.
        # Assumes titles, urls and summaries appear in the same order on the
        # page -- the original script relied on the same assumption.
        for title, url, summary in zip(titles, urls, summaries):
            parsed = parse_listing(summary)
            if parsed is None:
                continue
            fa.write(str((title, url, *parsed)))
            fa.write('\n')


if __name__ == '__main__':
    main()
未完待续。
后续将继续按区域（分区）进行爬取，各区域对应的 URL 路径如下：
# Maps a Shanghai district name (as shown on the site) to a URL path segment.
# NOTE(review): how the segment is joined onto the base URL is not shown in
# this file -- confirm before use.
arpa_dict = {
    '不限': 'house',
    '浦东': 'house-a025',
    '嘉定': 'house-a029',
    '宝山': 'house-a030',
    '闵行': 'house-a018',
    '松江': 'house-a0586',
    '普陀': 'house-a028',
    '静安': 'house-a021',
    # NOTE(review): the next eight districts all share 'house-a024'. This
    # looks like copy-pasted placeholder data -- verify each code against
    # the site before relying on it.
    '黄浦': 'house-a024',
    '虹口': 'house-a024',
    '青浦': 'house-a024',
    '奉贤': 'house-a024',
    '金山': 'house-a024',
    '杨浦': 'house-a024',
    '徐汇': 'house-a024',
    '长宁': 'house-a024',
    '崇明': 'house-a0996',
    '上海周边': 'house-a01046',
}
作者:23hdsdh
链接:https://www.pythonheidong.com/blog/article/6204/5a49c82cc038d5c52e7f/
来源:python黑洞网
任何形式的转载都请注明出处,如有侵权 一经发现 必将追究其法律责任
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明：网站文章均由用户上传，仅供读者学习交流使用，禁止用作商业用途。若文章涉及色情、反动、侵权等违法信息，请向我们举报，一经核实我们会立即删除！