qdyxmas's Room

Done is better than perfect...

获取城镇信息

#!/usr/bin/python
# coding=utf-8
import time

from bs4 import BeautifulSoup
import os
import sys
import json
import re
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

from requests_html import HTMLSession
def cur_file_dir():
    """Return the directory the running script lives in.

    The location is taken from ``sys.path[0]``:

    * if it is a directory, it is returned unchanged;
    * if it is a file, its parent directory is returned;
    * otherwise (for example when it is an empty string) the current
      working directory is returned, so callers always get a usable
      directory string instead of ``None``.

    Returns:
        str: a directory path.
    """
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    if os.path.isfile(path):
        return os.path.dirname(path)
    # Neither a directory nor a file: fall back to the working directory
    # rather than implicitly returning None, which would make a later
    # os.path.join(path, ...) raise TypeError.
    return os.getcwd()
def get_now(key):
    """Build a timestamped JSON file name.

    The result is the current local time formatted as ``YYYYmmddHHMMSS``
    (second resolution), followed by ``key`` and the ``.json`` suffix.

    Args:
        key: value appended after the timestamp (formatted with ``str.format``).

    Returns:
        str: for example ``"20200101120000abc.json"`` for ``key="abc"``.
    """
    whole_seconds = int(time.time())
    stamp = time.strftime("%Y%m%d%H%M%S", time.localtime(whole_seconds))
    return "{}{}.json".format(stamp, key)
# Directory that write_to_json() joins its output file names onto;
# resolved once, when the module is loaded.
path = cur_file_dir()
class GetCities():
    """Crawl administrative-division code pages below ``base_url``.

    The levels are fetched top-down and each ``get_*`` method stores its
    result on the instance for the next level to consume, so they must be
    called in this order::

        get_provinces() -> get_cities() -> get_counties()
                        -> get_towns()  -> get_villagetr() -> write_msg()

    Every level is a nested dict keyed by the page link (relative to
    ``base_url``) of each ancestor; the leaves are
    ``{"name": ..., "code": ...}`` dicts (village leaves additionally carry
    ``"vill_code"``).
    """

    # Matches the trailing "<digits>.html" page name of a link so that the
    # remaining directory part can be reused as prefix for child links.
    _PAGE_RE = re.compile(r"[0-9]+\.html")

    def __init__(self,base_url=''):
        """Create the crawler.

        Args:
            base_url: root URL of the division pages; when empty the 2018
                stats.gov.cn listing is used.
        """
        if not base_url:
            base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"
        self.base_url = base_url
        self.headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
        }
        self.session = HTMLSession()

    @staticmethod
    def _as_row(sub_html):
        """Return ``sub_html`` wrapped in ``<tr>...</tr>`` unless it already starts with ``<tr``."""
        # NOTE(review): presumably .html() can yield None for an empty row;
        # treat that as an empty fragment instead of failing on None.
        sub_html = sub_html or ""
        if sub_html.startswith("<tr"):
            return sub_html
        return "<tr>" + sub_html + "</tr>"

    def _iter_rows(self, reqhtml, class_name):
        """Yield one parsed BeautifulSoup fragment per element with CSS class ``class_name``."""
        rows = reqhtml.html.pq(".{}".format(class_name))
        for position in range(len(rows)):
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            yield BeautifulSoup(self._as_row(rows.eq(position).html()), "html.parser")

    def get_provinces(self):
        """Fetch the index page and collect every province link.

        Returns:
            dict: ``{href: {"name": link_text, "code": href}}``; also stored
            as ``self.provinces_dict``.
        """
        all_provinces_dict = {}
        r = self.session.get(self.base_url,headers=self.headers)
        for soup in self._iter_rows(r, "provincetr"):
            for anchor in soup.find_all("a"):
                province_code = anchor["href"]
                all_provinces_dict[province_code] = {"name":anchor.text,"code":province_code}
        self.provinces_dict = all_provinces_dict
        return all_provinces_dict

    def get_cities(self, province_keys=None):
        """Fetch the city rows of the selected province pages.

        Args:
            province_keys: iterable of province page links (keys of
                ``self.provinces_dict``, e.g. ``"43.html"``).  Defaults to
                ``["43.html"]``, the single page the original script crawled.

        Returns:
            dict: ``{province_link: {city_link: {"name", "code"}}}``; also
            stored as ``self.all_cities``.
        """
        if province_keys is None:
            province_keys = ["43.html"]
        all_cities = {}
        for k in province_keys:
            req = self.session.get(self.base_url + k,headers=self.headers)
            all_cities[k] = self.parser_data("citytr",req)
        self.all_cities = all_cities
        return all_cities

    def get_counties(self,**kargs):
        """Fetch the county rows for every city in ``self.all_cities``.

        ``kargs`` is accepted for backward compatibility and ignored.

        Returns:
            dict: ``{province_link: {city_link: {county_link: {"name", "code"}}}}``;
            also stored as ``self.all_counties``.
        """
        all_counties = {}
        for k,v in self.all_cities.items():
            # Strip the 5-character ".html" suffix: "43.html" -> "43/".
            # County hrefs are relative to the city page, which lives in
            # that directory.
            prefix = k[:-5] + "/"
            per_province = all_counties[k] = {}
            for k1 in v:
                req = self.session.get(self.base_url + k1,headers=self.headers)
                per_province[k1] = self.parser_data("countytr",req,prefix)
        self.all_counties = all_counties
        return all_counties

    def parser_data(self,class_name,reqhtml,province_code=""):
        """Extract ``(code, name)`` link pairs from rows with class ``class_name``.

        Each row is expected to hold its anchors in pairs: the first anchor's
        text is the code, the second anchor's text is the name, and the first
        anchor's ``href`` is the link to the child page.

        Args:
            class_name: CSS class of the table rows to read.
            reqhtml: response object exposing ``.html.pq``.
            province_code: path prefix put in front of every ``href``.

        Returns:
            dict: ``{link: {"name": ..., "code": ...}}`` where ``link`` is
            ``province_code + href`` with ``"//"`` collapsed to ``"/"``.
        """
        return_list = {}
        for soup in self._iter_rows(reqhtml, class_name):
            anchors = soup.find_all("a")
            # Anchors come two per entry; zip() ignores a trailing unpaired
            # anchor instead of raising IndexError.
            for code_tag, name_tag in zip(anchors[0::2], anchors[1::2]):
                link = (province_code + code_tag["href"]).replace("//","/")
                return_list[link] = {"name": name_tag.text, "code": code_tag.text}
        return return_list

    def parser_villagetr(self,class_name,reqhtml,province_code=""):
        """Extract village entries from rows with class ``class_name``.

        Cells are read three at a time: code, classification code, name.

        Args:
            class_name: CSS class of the table rows to read.
            reqhtml: response object exposing ``.html.pq``.
            province_code: unused; kept for signature parity with
                ``parser_data``.

        Returns:
            dict: ``{code: {"name": ..., "code": ..., "vill_code": ...}}``.
        """
        return_list = {}
        for soup in self._iter_rows(reqhtml, class_name):
            cells = soup.find_all("td")
            # Three cells per entry; incomplete trailing groups are ignored.
            for code_td, vill_td, name_td in zip(cells[0::3], cells[1::3], cells[2::3]):
                code = code_td.text
                return_list[code] = {
                    "name": name_td.text,
                    "code": code,
                    # write_msg() reads this key, so it has to be stored.
                    "vill_code": vill_td.text,
                }
        return return_list

    def get_towns(self):
        """Fetch the town rows for every county in ``self.all_counties``.

        Returns:
            dict: ``{province_link: {city_link: {county_link: {town_link:
            {"name", "code"}}}}}``; also stored as ``self.all_towns``.
        """
        all_towns = {}
        for k,v in self.all_counties.items():
            per_province = all_towns[k] = {}
            for k1,v1 in v.items():
                per_city = per_province[k1] = {}
                for k2 in v1:
                    # Directory part of the county link; town hrefs are
                    # relative to it.  parser_data collapses the doubled "/".
                    prefix = self._PAGE_RE.sub("",k2)
                    req = self.session.get(self.base_url + k2,headers=self.headers)
                    per_city[k2] = self.parser_data("towntr",req,prefix+"/")
        self.all_towns = all_towns
        return all_towns

    def get_villagetr(self,*kargs):
        """Fetch the village rows for every town in ``self.all_towns``.

        ``kargs`` is accepted for backward compatibility and ignored.

        Returns:
            dict: the ``self.all_towns`` hierarchy with each town link mapped
            to the dict produced by ``parser_villagetr``; also stored as
            ``self.all_villagetr``.
        """
        all_villagetr = {}
        # The original looped over an undefined name; the four-level walk
        # below matches the shape of self.all_towns.
        for k,v in self.all_towns.items():
            per_province = all_villagetr[k] = {}
            for k1,v1 in v.items():
                per_city = per_province[k1] = {}
                for k2,v2 in v1.items():
                    per_county = per_city[k2] = {}
                    for k3 in v2:
                        req = self.session.get(self.base_url + k3,headers=self.headers,timeout=None)
                        per_county[k3] = self.parser_villagetr("villagetr",req)
        self.all_villagetr = all_villagetr
        return all_villagetr

    @staticmethod
    def _label(entry):
        """Format an entry dict as ``"name(code)"``."""
        return entry['name'] + "(" + entry['code'] + ")"

    def write_msg(self,key="43.html"):
        """Merge all crawled levels of one province into a single tree.

        Args:
            key: province page link, a key of ``self.provinces_dict``.

        Returns:
            dict: nested ``{"name(code)": {...}}`` dicts for province, city,
            county and town; each town maps ``"name(code)"`` of a village to
            its ``vill_code`` (empty string when the entry has none).

        Raises:
            KeyError: when ``key`` or an intermediate link is missing from
                the previously crawled data.
        """
        all_dict = {}
        # Province node
        province_node = all_dict[self._label(self.provinces_dict[key])] = {}
        # City nodes
        for k,v in self.all_cities[key].items():
            print("k=",k)
            print("shi_v=",v)
            city_node = province_node[self._label(v)] = {}
            all_counties = self.all_counties[key][k]
            print("all_counties=",all_counties)
            for k1,v1 in all_counties.items():
                print("xian_v1=",v1)
                county_node = city_node[self._label(v1)] = {}
                all_towns = self.all_towns[key][k][k1]
                print("all_towns=",all_towns)
                for k2,v2 in all_towns.items():
                    # Skip a malformed entry without discarding its siblings.
                    if not isinstance(v2,dict):
                        continue
                    town_node = county_node[self._label(v2)] = {}
                    for v3 in self.all_villagetr[key][k][k1][k2].values():
                        if not isinstance(v3,dict):
                            continue
                        town_node[self._label(v3)] = v3.get('vill_code', "")
        return all_dict

    def read_json(self,filename):
        """Load and return the JSON document stored in the UTF-8 file ``filename``."""
        with open(filename,'r',encoding='utf-8') as load_f:
            return json.load(load_f)
def write_to_json(key,**tcase):
    """Write the keyword arguments as one JSON object to a timestamped file.

    The file name is produced by ``get_now(key)`` and the file is created
    inside the module-level ``path`` directory.  Output is UTF-8 and
    non-ASCII characters are written as-is (``ensure_ascii=False``).

    Args:
        key: label passed to ``get_now`` to build the file name.
        **tcase: the data to serialise.
    """
    target = os.path.join(path, get_now(key))
    with open(target, mode='w', encoding='utf-8') as out:
        json.dump(tcase, out, ensure_ascii=False)
if __name__ == "__main__":
    crawler = GetCities()
    # Provinces are only needed as in-memory state for the later steps;
    # nothing is written for them.
    crawler.get_provinces()
    # Each step depends on the state stored by the previous one, so the
    # order matters; every result is dumped to its own timestamped file.
    steps = (
        ("市", crawler.get_cities),
        ("县", crawler.get_counties),
        ("镇", crawler.get_towns),
        ("村", crawler.get_villagetr),
        ("湖南", crawler.write_msg),
    )
    for label, fetch in steps:
        write_to_json(label, **fetch())
    # print(all_cities)
    # print(all_towns)

标签 

805 评论

  1. Lino
    /回复

    sildenafil citrate 100mg pills best viagra pills https://www.water-handbook.com/index.php?title=Black_Viagra_21383 - sildenafil 20 mg tablet coupon where can i buy viagra online safely where can you buy generic viagra buy online viagra capsules

  2. JosephAmorb
    /回复

    тягачи цена http://deti.taminfo.ru/index.php?subaction=userinfo&user=uguhipeg купить сцепку продажа тягачей на авито по россии

  3. Herbertjoymn
    /回复

    сервер раскрытия информации интерфакс http://www.spvzlet.ru/forum/messages/forum2/topic5976/message9722/?result=new#message9722 при выключении компьютера вся информация стирается конфиденциальность информации это информация для человека это http://kolokolchik237.ru/index.php?subaction=userinfo&user=igixoguno устройство вывода информации росреестр справочная информация по объектам недвижимости онлайн

  4. BillyJef
    /回复

    доступность информации это https://subscribe.ru/digest/cookery/salad/n369674964.html хранение информации средства защиты информации

  5. MartyKek
    /回复

    при отключении компьютера информация обработка информации это http://ya.listbb.ru/viewtopic.php?f=14&t=217&p=1001 информация это в информатике

  6. BillyJef
    /回复

    справочная информация росреестра http://baby-best.ru/forum/topic_39577/1 алфавитный подход к измерению информации информация и информационные процессы

  7. MartyKek
    /回复

    способы передачи информации какое устройство обладает наибольшей скоростью обмена информацией http://ya.iboards.ru/viewtopic.php?f=13&t=356&p=1205 материальным носителем наследственной информации в клетке является

  8. new homes minnesota
    /回复

    It's the best time to make some plans for the future and it's time to be happy. I've read this post and if I could I wish to suggest you few interesting things or suggestions. Maybe you could write next articles referring to this article. I wish to read even more things about it! https://www.eternityhomesllc.com/

  9. Michaelemoth
    /回复

    The most popular and convenient Cryptocurrency Exchange in 16 languages. Everything is made for people. Earning is now easier. No restrictions. Huge selection of tools Come and earn now! http://bit.ly/3bAtK2O *** Самая ТОПОВАЯ и удобная Биржа криптовалют на 16 языках. Все создано для людей. Зарабатывать теперь проще. Никаких ограничений. Огромный выбор инструментов Заходи и зарабатывай сейчас! http://bit.ly/3bAtK2O

  10. Jefferykip
    /回复

    porno kim kardashian [url=http://irshansk-rada.com.ua/]teens porno[/url] porno drunk porno drunk

  11. Jefferykip
    /回复

    free porno video <a href=http://irshansk-rada.com.ua/>pkf porno</a> lenkino porno porno free

  12. Stephenabept
    /回复

    vr porno <a href=http://king-pinup642.od.ua/kosmolot/>online porno</a> porno hardcore www porno

  13. Joshuachese
    /回复

    porno hd 1080 <a href=http://school24.if.ua/>pmv porno</a> porno hub porno webcam

  14. Charlesmop
    /回复

    loli porno <a href=https://www.rugbyclubs.info/>porno hub зеркало</a> porno vk good porno

  15. Stephenabept
    /回复

    gey porno <a href=http://king-pinup642.od.ua/kosmolot/>russian gay porno</a> porus porno porno magazine

  16. EdwardIcoma
    /回复

    porno 1080 hd <a href=https://cosmolot-24.com.ua/ua/>porno hardcore</a> porno video hd porno movies

  17. ErrolGex
    /回复

    porno family <a href=https://cosmolot-24.com.ua/ru/>vr porno</a> porno lisa ann porno big tits

  18. Joshuachese
    /回复

    porno dp <a href=http://school24.if.ua/>porno ipad</a> porno girl porno big tits

  19. Jefferykip
    /回复

    dancing bear porno <a href=http://irshansk-rada.com.ua/>porno lisa ann</a> playboy porno sasha grey porno

  20. Joshuachese
    /回复

    porno dog <a href=http://school24.if.ua/>you porno</a> mother porno porno tracker

  21. Joshuachese
    /回复

    porno iznasilovanie <a href=http://school24.if.ua/>porno comics</a> porno nd porno drunk

  22. ErrolGex
    /回复

    porno lisa ann <a href=https://cosmolot-24.com.ua/ru/>porno stars</a> porno magazine anal porno

  23. EdwardIcoma
    /回复

    porno iznasilovanie <a href=https://cosmolot-24.com.ua/ua/>kisankanna porno</a> porno hardcore porno onlayn

  24. Charlesmop
    /回复

    porno zvezda <a href=https://www.rugbyclubs.info/>sex porno film</a> dredd porno porno webcam

  25. Joshuachese
    /回复

    sasha grey porno <a href=http://school24.if.ua/>undertale porno</a> porno pictures porno film online

  26. Jefferykip
    /回复

    best porno <a href=http://irshansk-rada.com.ua/>porno ipad</a> sfm porno sex porno video

  27. Joshuachese
    /回复

    mother porno <a href=http://school24.if.ua/>hd porno</a> kisankanna porno yaoi porno

  28. RichardBut
    /回复

    porno zvezda <a href=https://brody.com.ua/>porno hub</a> porno swingers mother porno

  29. Jefferykip
    /回复

    black porno <a href=http://irshansk-rada.com.ua/>porno online</a> porno photo vk porno

  30. Michaelemoth
    /回复

    The most popular and convenient Cryptocurrency Exchange in 16 languages. Everything is made for people. Earning is now easier. No restrictions. Huge selection of tools Come and earn now! http://bit.ly/3bAtK2O *** Самая ТОПОВАЯ и удобная Биржа криптовалют на 16 языках. Все создано для людей. Зарабатывать теперь проще. Никаких ограничений. Огромный выбор инструментов Заходи и зарабатывай сейчас! http://bit.ly/3bAtK2O

评论