成功最有效的方法就是向有经验的人学习!

python爬安居客租房信息

穷人没办法,为租房而奔波,找房实在太难,数据爬下来用EXCEL分析,过滤无效信息,有需要拿去
file

import requests
import bs4
import json
import time
import os
from lxml import etree

class spider(object):
    """Scraper for Anjuke (anjuke.com) rental listings.

    NOTE(review): class name kept lowercase for backward compatibility with
    existing callers (PEP 8 would prefer ``Spider``).
    """

    def __init__(self):
        # Landing page used to discover the per-city listing sites.
        self.url = "https://sh.zu.anjuke.com/?from=navigation"
        # Desktop browser UA so the site serves the normal HTML layout.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_cities(self):
        """Return ``{city_name: city_url}`` scraped from the city dropdown.

        Side effect: creates a ``./data/<city_name>`` directory per city.
        """
        response = requests.get(self.url, headers=self.headers, timeout=30).text
        html = etree.HTML(response)
        cities = {}
        for item in html.xpath('//div[@id="city_list"]//dd/a'):
            city_url = item.xpath('./@href')[0]
            city_name = item.xpath('./@title')[0]
            cities[city_name] = city_url
            # exist_ok avoids the racy exists()-then-makedirs pattern
            # (and the dead `else: pass` branch) of the original.
            os.makedirs(os.path.join('./data', city_name), exist_ok=True)
        return cities

    def search_addrs(self, url):
        """Return ``{district_name: district_url}`` from a city listing page.

        The slice ``[1:-2]`` skips the leading "all" link and the last two
        entries of the district bar, matching the original behaviour.
        """
        response = requests.get(url, headers=self.headers, timeout=20).text
        html = etree.HTML(response)
        addrs = html.xpath('//div[@class="sub-items sub-level1"]//a')
        address = {}
        for item in addrs[1:-2]:
            address[item.xpath('./text()')[0]] = item.xpath('./@href')[0]
        return address

    def search_house(self, address_url):
        """Print the listings on one result page and return them.

        Returns a list of dicts (keys: url/img/name/addrs/price); each value
        is the raw xpath result list and may be empty if the page layout
        differs. Returning the data is a backward-compatible addition: the
        original printed and returned None, and no caller used the return.
        """
        response = requests.get(address_url, headers=self.headers, timeout=20).text
        html = etree.HTML(response)
        houses = []
        # Skip the first two child divs (non-listing header rows), as before.
        for item in html.xpath('//div[@id ="list-content"]/div')[2:]:
            info = {
                'url': item.xpath('./a/@href'),
                'img': item.xpath('./a/img/@src'),
                'name': item.xpath('./div[@class="zu-info"]/h3/a/b/text()'),
                'addrs': item.xpath('./div[@class="zu-info"]/address/a/text()'),
                'price': item.xpath('./div[@class="zu-side"]//b/text()'),
            }
            houses.append(info)
            # Same field order as the original print statement.
            print(info['addrs'], info['img'], info['name'], info['price'], info['url'])
        return houses

if __name__ == '__main__':
    anjuke = spider()
    print('正在爬取')
    # Fix: listing pages are numbered from 1 (…x1-p1, …x1-p2, …); the
    # original range(50) started at p0, which is not a real page. Still
    # fetches 50 pages. TODO confirm p0 vs p1 against the live site.
    for page in range(1, 51):
        address_url = 'https://sh.zu.anjuke.com/fangyuan/qingpu/x1-p' + str(page)
        anjuke.search_house(address_url)
        # Crude rate limiting: pause one minute after every 10 pages
        # (the separate `i` counter duplicated `page` and was removed).
        if page % 10 == 0:
            time.sleep(60)
赞(1) 打赏
未经允许不得转载:陈桂林博客 » python爬安居客租房信息
分享到

大佬们的评论 抢沙发

全新“一站式”建站,高质量、高售后的一条龙服务

微信 抖音 支付宝 百度 头条 快手全平台打通信息流

橙子建站.极速智能建站8折购买虚拟主机

觉得文章有用就打赏一下文章作者

非常感谢你的打赏,我们将继续给力更多优质内容,让我们一起创建更加美好的网络世界!

支付宝扫一扫打赏

微信扫一扫打赏

登录

找回密码

注册