qdyxmas's Room

Done is better than perfect...

获取城镇信息

#!/usr/bin/python
# coding=utf-8
import time

from bs4 import BeautifulSoup
import os
import sys
import json
import re
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

from requests_html import HTMLSession
def cur_file_dir():
    """Return the directory the script was started from.

    The location is taken from ``sys.path[0]``. If that entry is a
    directory it is returned unchanged; if it is a file, the file's parent
    directory is returned.

    Returns:
        str: a directory path. When ``sys.path[0]`` is neither an existing
        directory nor an existing file (for example the empty string), the
        current working directory is returned so that callers always get a
        value usable with ``os.path.join``.
    """
    location = sys.path[0] if sys.path else ""
    if os.path.isdir(location):
        return location
    if os.path.isfile(location):
        return os.path.dirname(location)
    # Neither a directory nor a file: fall back instead of returning None.
    return os.getcwd()
def get_now(key):
    """Build a timestamped JSON file name.

    Args:
        key: value appended after the timestamp (formatted with ``str.format``).

    Returns:
        str: the local time as ``YYYYmmddHHMMSS`` followed by ``key`` and
        the ``.json`` extension.
    """
    suffix = "{}.json".format(key)
    stamp = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
    return stamp + suffix
# Directory returned by cur_file_dir(); write_to_json() joins its output file names onto it.
path = cur_file_dir()
class GetCities():
    """Scrape a province -> city -> county -> town -> village hierarchy.

    Each level is read from table rows selected by CSS class
    (``provincetr``, ``citytr``, ``countytr``, ``towntr``, ``villagetr``)
    on pages below ``base_url``. Results are nested dicts keyed by the
    relative link of each page, and are also cached on the instance
    (``provinces_dict``, ``all_cities``, ``all_counties``, ``all_towns``,
    ``all_villagetr``) because every level is fetched from the links found
    at the previous level.
    """

    # Trailing "<digits>.html" page name of a relative link; removing it
    # leaves the directory prefix that links on that page are relative to.
    _PAGE_NAME_RE = re.compile(r"([0-9]+\.html)")

    def __init__(self,base_url=''):
        """Create the HTTP session.

        Args:
            base_url: root URL of the listing; when empty, the 2018 listing
                on www.stats.gov.cn is used.
        """
        if not base_url:
            base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"
        self.base_url = base_url
        self.headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
        }
        self.session = HTMLSession()

    @staticmethod
    def _row_soup(fragment):
        """Parse the HTML of one table row; return None for an empty row.

        The fragment is wrapped in ``<tr>...</tr>`` when it does not already
        start with a ``<tr`` tag, so the parser always sees a complete row.
        """
        if not fragment:
            return None
        if not fragment.startswith("<tr"):
            fragment = "<tr>" + fragment + "</tr>"
        # An explicit parser keeps the result independent of which optional
        # parser libraries happen to be installed.
        return BeautifulSoup(fragment, "html.parser")

    def get_provinces(self):
        """Fetch the province list from ``base_url``.

        Returns:
            dict: ``{href: {"name": link text, "code": href}}`` for every
            link found in the ``provincetr`` rows. Also stored as
            ``self.provinces_dict``.
        """
        all_provinces_dict = {}
        r = self.session.get(self.base_url,headers=self.headers)
        rows = r.html.pq(".provincetr")
        for x in range(len(rows)):
            soup = self._row_soup(rows.eq(x).html())
            if soup is None:
                continue
            for anchor in soup.find_all("a"):
                href = anchor.get("href")
                if href is None:
                    # An anchor without a target cannot be followed later.
                    continue
                all_provinces_dict[href] = {"name":anchor.text,"code":href}
        self.provinces_dict = all_provinces_dict
        return all_provinces_dict

    def get_cities(self,province_keys=("43.html",)):
        """Fetch the cities of the selected provinces.

        Args:
            province_keys: relative province page links to fetch. Defaults
                to ``("43.html",)``; pass ``self.provinces_dict`` to fetch
                every province found by ``get_provinces``.

        Returns:
            dict: ``{province link: {city link: {"name", "code"}}}``. Also
            stored as ``self.all_cities``.
        """
        all_cities = {}
        for k in province_keys:
            url = self.base_url + k
            req = self.session.get(url,headers=self.headers)
            all_cities[k] = self.parser_data("citytr",req)
        self.all_cities = all_cities
        return all_cities

    def get_counties(self,**kargs):
        """Fetch the counties of every city in ``self.all_cities``.

        Requires ``get_cities`` to have been called first. Keyword
        arguments are accepted and ignored.

        Returns:
            dict: ``{province link: {city link: {county link: {"name",
            "code"}}}}``. Also stored as ``self.all_counties``.
        """
        all_counties = {}
        for k,v in self.all_cities.items():
            # County links are relative to the province directory, which is
            # the province page name without its ".html" extension.
            province_dir = k[:-len(".html")] if k.endswith(".html") else k
            all_counties[k] = {}
            for k1 in v:
                url = self.base_url + k1
                req = self.session.get(url,headers=self.headers)
                all_counties[k][k1] = self.parser_data("countytr",req,province_dir+"/")
        self.all_counties = all_counties
        return all_counties

    def parser_data(self,class_name,reqhtml,province_code=""):
        """Extract code/name link pairs from the rows of one page.

        Links in a row are consumed two at a time: the first link's text is
        the code and the second link's text is the name. A trailing
        unpaired link is ignored.

        Args:
            class_name: CSS class of the table rows to read.
            reqhtml: response object exposing ``.html.pq``.
            province_code: prefix prepended to each ``href`` so the stored
                link is relative to ``base_url``.

        Returns:
            dict: ``{link: {"name": name, "code": code}}``.
        """
        rows = reqhtml.html.pq(".{}".format(class_name))
        return_list = {}
        for x in range(len(rows)):
            soup = self._row_soup(rows.eq(x).html())
            if soup is None:
                continue
            anchors = soup.find_all("a")
            for code_link, name_link in zip(anchors[0::2], anchors[1::2]):
                link = province_code + code_link["href"]
                # Joining a prefix that ends in "/" can double the slash.
                link = link.replace("//","/")
                return_list[link] = {"name":name_link.text,"code":code_link.text}
        return return_list

    def parser_villagetr(self,class_name,reqhtml,province_code=""):
        """Extract village entries from the rows of one page.

        Cells are consumed three at a time: code, a second code column, and
        name. Incomplete trailing groups are ignored.

        Args:
            class_name: CSS class of the table rows to read.
            reqhtml: response object exposing ``.html.pq``.
            province_code: unused; kept for signature parity with
                ``parser_data``.

        Returns:
            dict: ``{code: {"name", "code", "vill_code"}}``.
        """
        rows = reqhtml.html.pq(".{}".format(class_name))
        return_list = {}
        for x in range(len(rows)):
            soup = self._row_soup(rows.eq(x).html())
            if soup is None:
                continue
            cells = soup.find_all("td")
            for code_td, vill_td, name_td in zip(cells[0::3], cells[1::3], cells[2::3]):
                code = code_td.text
                return_list[code] = {
                    "name":name_td.text,
                    "code":code,
                    # Read back by write_msg(); presumably the urban/rural
                    # classification column of the page - TODO confirm.
                    "vill_code":vill_td.text,
                }
        return return_list

    def get_towns(self):
        """Fetch the towns of every county in ``self.all_counties``.

        Requires ``get_counties`` to have been called first.

        Returns:
            dict: province -> city -> county -> ``{town link: {"name",
            "code"}}``. Also stored as ``self.all_towns``.
        """
        all_towns = {}
        for k,v in self.all_counties.items():
            all_towns[k] = {}
            for k1,v1 in v.items():
                all_towns[k][k1] = {}
                for k2 in v1:
                    # Town links are relative to the county page's directory.
                    prefix = self._PAGE_NAME_RE.sub("",k2)
                    url = self.base_url + k2
                    req = self.session.get(url,headers=self.headers)
                    all_towns[k][k1][k2] = self.parser_data("towntr",req,prefix+"/")
        self.all_towns = all_towns
        return all_towns

    def get_villagetr(self,*kargs):
        """Fetch the village committees of every town.

        Args:
            *kargs: optionally, as the first positional argument, a town
                mapping shaped like the return value of ``get_towns``.
                When omitted, ``self.all_towns`` is used (so ``get_towns``
                must have been called first).

        Returns:
            dict: province -> city -> county -> town -> ``{code: {"name",
            "code", "vill_code"}}``. Also stored as ``self.all_villagetr``.
        """
        towns = kargs[0] if kargs else self.all_towns
        all_villagetr = {}
        for k,v in towns.items():
            all_villagetr[k] = {}
            for k1,v1 in v.items():
                all_villagetr[k][k1] = {}
                for k2,v2 in v1.items():
                    all_villagetr[k][k1][k2] = {}
                    for k3 in v2:
                        url = self.base_url + k3
                        req = self.session.get(url,headers=self.headers,timeout=None)
                        all_villagetr[k][k1][k2][k3] = self.parser_villagetr("villagetr",req)
        self.all_villagetr = all_villagetr
        return all_villagetr

    def write_msg(self,key="43.html"):
        """Merge all fetched levels for one province into a readable tree.

        Every level is keyed by ``"name(code)"``; the innermost value is the
        village's ``vill_code`` (empty string when the stored village entry
        has none).

        Args:
            key: province link to export; must be present in
                ``provinces_dict``, ``all_cities``, ``all_counties``,
                ``all_towns`` and ``all_villagetr``.

        Returns:
            dict: ``{province: {city: {county: {town: {village:
            vill_code}}}}}``.

        Raises:
            KeyError: if ``key`` or any intermediate link is missing from
                the cached data.
        """
        all_dict = {}
        # Province level
        province = self.provinces_dict[key]
        p_key = province["name"]+"("+province["code"]+")"
        all_dict[p_key] = {}
        # City level
        for k,v in self.all_cities[key].items():
            p_shi = v['name']+"("+v['code']+")"
            city_node = all_dict[p_key][p_shi] = {}
            # County level
            for k1,v1 in self.all_counties[key][k].items():
                p_xian = v1['name']+"("+v1['code']+")"
                county_node = city_node[p_xian] = {}
                # Town level
                for k2,v2 in self.all_towns[key][k][k1].items():
                    if not isinstance(v2,dict):
                        break
                    p_town = v2['name']+"("+v2['code']+")"
                    town_node = county_node[p_town] = {}
                    # Village level
                    for k3,v3 in self.all_villagetr[key][k][k1][k2].items():
                        if not isinstance(v3,dict):
                            break
                        p_villagetr = v3['name']+"("+v3['code']+")"
                        # Village entries saved without "vill_code" must not
                        # abort the whole export.
                        town_node[p_villagetr] = v3.get('vill_code',"")
        return all_dict

    def read_json(self,filename):
        """Load and return the JSON content of ``filename`` (UTF-8)."""
        with open(filename,'r',encoding='utf-8') as load_f:
            ret = json.load(load_f)
        return ret
def write_to_json(key,**tcase):
    """Dump the keyword arguments as one JSON object into a timestamped file.

    The file name comes from ``get_now(key)`` and the file is created in the
    module-level ``path`` directory, encoded as UTF-8 with non-ASCII
    characters written unescaped.

    Args:
        key: label placed in the file name after the timestamp.
        **tcase: the data to serialise.
    """
    target = os.path.join(path,get_now(key))
    with open(target,'w',encoding='utf-8') as out:
        out.write(json.dumps(tcase,ensure_ascii=False))
if __name__ == "__main__":
    # Each level is fetched from the links cached by the previous one, so the
    # order below matters. Every result is written to its own JSON file as
    # soon as it is available.
    c = GetCities()
    # The return value is not needed here (and is no longer bound to `all`,
    # which shadowed the builtin); the call caches the provinces on `c`.
    c.get_provinces()
    stages = (
        ("市", c.get_cities),
        ("县", c.get_counties),
        ("镇", c.get_towns),
        ("村", c.get_villagetr),
        ("湖南", c.write_msg),
    )
    for label, fetch in stages:
        write_to_json(label, **fetch())

标签 

106 评论

  1. Savannah
    /回复

    Definitely believe that which you said. Your favorite justification seemed to be on the web the easiest thing to be aware of. I say to you, I certainly get annoyed while people consider worries that they plainly don't know about. You managed to hit the nail upon the top as well as defined out the whole thing without having side-effects; people could take a signal. Will probably be back to get more. Thanks виагра web page анаболические препараты

  2. Dorothy
    /回复

    Thank you for the good writeup. It in fact was a amusement account it. https://lascootershop.ca/en/pourquoi-acheter-un-scooter-electrique/

  3. Internet
    /回复

    Hola! I've been reading your web site for a while now and finally got the bravery to go ahead and give you a shout out from Atascocita Texas! Just wanted to mention keep up the fantastic job! https://tutpub.com/entertainment/%D9%88%D9%88%D8%B1%D8%AF%D8%A8%D8%B1%D9%8A%D8%B3-%D8%B3%D9%8A%D9%88-%D9%85%D8%A7%D8%B3%D8%AA%D8%B1/

  4. Jose
    /回复

    I like it when folks come together and share ideas. Great website, keep it up! https://www.nudermacosmetique.com/pourquoi-ma-peau-a-besoin-dhydratation-apres-lete/

  5. Frederick
    /回复

    Everyone loves it when folks come together and share ideas. Great blog, stick with it! https://www.nudermacosmetique.com

  6. 온라인카지노
    /回复

    Thanks , I've just been looking for info approximately this subject for ages and yours is the greatest I've found out so far. However, what in regards to the conclusion? Are you certain concerning the supply? http://wd.ua/user/moran25guldager/

  7. divi wordpress theme
    /回复

    Pretty section of content. I simply stumbled upon your weblog and in accession capital to say that I get in fact enjoyed account your blog posts. Any way I'll be subscribing to your augment and even I success you get entry to consistently quickly. https://uberant.com/article/963288-what-is-the-wordpress-blogging-system/

  8. gestyy.com
    /回复

    I'm really loving the theme/design of your website. Do you ever run into any browser compatibility problems? A small number of my blog readers have complained about my website not operating correctly in Explorer but looks great in Firefox. Do you have any advice to help fix this problem? http://gestyy.com/eeeBTM

  9. Elle & Lui
    /回复

    A person essentially help to make significantly posts I would state. This is the very first time I frequented your web page and to this point? I amazed with the analysis you made to create this actual post incredible. Magnificent activity!| Heya i?m for the primary time here. https://lapimbeche.com/collections/stickers

  10. Elle & Lui
    /回复

    A person essentially assist to make seriously posts I would state. This is the very first time I frequented your web page and so far? I surprised with the analysis you made to make this particular publish incredible. Magnificent task!| Heya i am for the primary time here. https://lapimbeche.com/collections/keychain

  11. قصة عشق زهرة تالوت
    /回复

    Hello, I enjoy reading through your post. I wanted to write a little comment to support you. https://www.scutify.com/profiles/drama07024

  12. Johnette
    /回复

    I love it when individuals get together and share views. Great blog, continue the good work! https://www.lyndahoffman.com/index.html

  13. lascootershop
    /回复

    Thank you for the good writeup. It in fact was a amusement account it. https://lascootershop.ca/les-quatre-avantages-des-scooters-electriques-que-vous-devez-connaitre/

  14. Gwen
    /回复

    I hope to give something back and help others like you aided me. https://aklexterminateur.com/comment-se-debarrasser-des-fourmis-charpentieres-a-la-maison/

  15. Maik
    /回复

    Great article, totally what I needed. Avanafil 200mg web site beste online apotheke

  16. orbita sparta
    /回复

    Ahaa, its fastidious dialogue regarding this post at this place at this blog, I have read all that, so at this time me also commenting at this place. http://slcenvironmental.ca/how-to-build-a-concrete-pool/

  17. bulk chicken snack sticks
    /回复

    Hi! I've been following your web site for a long time now and finally got the bravery to go ahead and give you a shout out from Atascocita Texas! Just wanted to tell you keep up the good work! http://l.addersuq@lulle.sakura.ne.jp/cgi-bin/kemobook/g_book.cgi/g_book.cgi

  18. 온라인카지노
    /回复

    It's the best time to make some plans for the long run and it's time to be happy. I've learn this post and if I could I want to counsel you some interesting things or tips. Perhaps you could write subsequent articles regarding this article. I want to learn even more things about it! https://gumroad.com/7361843786318/p/blackjack-casino-rules-and-bets-a2b901d5-8116-4739-a2ab-097c920287f2

  19. swimming stores near me
    /回复

    It's amazing in favor of me to have a website, which is useful in support of my know-how. thanks admin http://s-kingdom.s34.xrea.com/cgi-bin/peoples/scr/scr.cgi

  20. Felicitas
    /回复

    Hi my loved one! I wish to say that this article is awesome, nice written and come with almost all vital infos. I would like to see extra posts like this . взять кредит под залог web site деньги под залог авто киев

  21. Geam Bova Lexio 2008
    /回复

    Hi, I do think this is a great site. I stumbledupon it ;) I'm going to revisit yet again since I book marked it. Money and freedom is the greatest way to change, may you be rich and continue to guide others. https://anunturi-parbrize.ro/index.php?cauta=geam+bova

  22. 카지노사이트
    /回复

    Hey there! I know this is kinda off topic nevertheless I'd figured I'd ask. Would you be interested in exchanging links or maybe guest writing a blog article or vice-versa? My site goes over a lot of the same topics as yours and I believe we could greatly benefit from each other. If you are interested feel free to send me an email. I look forward to hearing from you! Wonderful blog by the way! https://sites.google.com/view/top-ten-betting-mistakes-/home

  23. Shawn
    /回复

    Excellent post. I was checking continuously this blog and I am impressed! Extremely helpful info particularly the last part : ) I care for such information a lot. I was seeking this particular information for a very long time. Thank you and good luck. посуда купить киев web page полотенца бумажные

  24. seo services.com
    /回复

    Whats up are using Wordpress for your site platform? I'm new to the blog world but I'm trying to get started and create my own. Do you require any html coding knowledge to make your own blog? Any help would be really appreciated! https://www.wattpad.com/user/barron8965

  25. максим иванчук пластический хирург отзывы
    /回复

    What I don't realize is actually how you are now not really much more well-liked than you may be now. You're so intelligent. You recognize therefore significantly with regards to this subject, made me for my part imagine it from so many varied angles. Its like men and women aren't interested unless it is something to accomplish with Girl gaga! Your individual stuffs nice. Always take care of it up! https://www.otzyvua.net/ivanchuk-maksim-sergeevich максим иванчук пластический хирург отзывы максим иванчук пластический хирург отзывы https://www.medcentre.com.ua/vrachi/ivanchuk-maksim-sergeevich.html

  26. seo services godaddy
    /回复

    Your method of explaining the whole thing in this article is truly good, all be able to simply be aware of it, Thanks a lot. http://www.webestools.com/profile-298318.html

  27. aboutme
    /回复

    I know this website gives quality based posts and other data, is there any other site which offers these things in quality? https://forums.ubisoft.com/member.php/6021350-barron8965?tab=aboutme

  28. RockWrist Wrist Wraps for CrossFit WOD Weight Training
    /回复

    Greetings from Ohio! I'm bored to death at work so I decided to check out your site on my iphone during lunch break. I love the knowledge you provide here and can't wait to take a look when I get home. I'm shocked at how fast your blog loaded on my phone .. I'm not even using WIFI, just 3G .. Anyways, amazing blog! http://www.stoneville.fi/pihakivetykset/piha/

  29. Seattle Mariners Merchandise
    /回复

    I could not resist commenting. Perfectly written! http://www.keralalaughteryoga.com/kerala_laughter_yoga_1/

  30. Denny
    /回复

    Remarkable things here. I am very glad to see your article. Thank you so much and I am looking forward to contact you. Will you please drop me a mail? Best 90s songs web site best classic books

评论