# stateweath.py
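"""Crawl real-time air-quality readings (AQI, PM2.5, PM10) for Shenzhen
monitoring stations and interpolate them onto a coordinate grid.

crawlpm() scrapes 86pm25.com and falls back to crawlpm_other()
(pm25china.net) on any failure; downscal() spreads the readings over the
grid in libs/cejwdwww10000.csv using its precomputed per-station distance
weights, returning a DataFrame with columns lon, lat, pm25 and pm10.
"""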
import os
import re
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


def crawlpm_other():
    """Fallback crawler: scrape station readings from pm25china.net."""
    url1 = "http://www.pm25china.net/shenzhen/"
    posiState = {}
    response = urllib.request.urlopen(url1)
    soup = BeautifulSoup(response, features='lxml')
    names = ["横岗", "民治", "莲花", "南海子站", "通心岭子站", "荔园", "洪湖",
             "华侨城", "南油", "盐田", "龙岗", "西乡", "南澳", "葵涌", "梅沙", "观澜"]
    # The station table sits in the <div class="weilai"> block; split it into
    # rows and read the numeric cells of each station row.
    shuju = str(soup.findAll("div", {"class": "weilai"})).split("<tr>")
    i = 0
    for eachline in shuju:
        if "href" in eachline:
            eachval = eachline.split("<td>")
            posiState[names[i]] = [int(eachval[2].replace("</td>\n", "")),
                                   int(eachval[4].replace("</td>\n", "")),
                                   int(eachval[5].replace("</td>\n", ""))]
            i += 1
    return posiState


def crawlpm():
    """Primary crawler: scrape station readings from 86pm25.com.

    Falls back to crawlpm_other() if the request or parsing fails.
    """
    try:
        url1 = "http://www.86pm25.com/city/shenzhen.html"
        posiState = {}
        response = urllib.request.urlopen(url1)
        text = response.read().decode("utf-8")
        shuju = re.findall('<tr><td>(.*?)</td></tr>', text, re.S)
        renewTime = re.findall('<div class="remark">(.*?)</div>', text, re.S)
        data_time = renewTime[0][3:].split(" ")
        for each in shuju:
            eachsplit = each.split("</td><td>")
            # Drop the cell at index 2 (apparently non-numeric) and append the
            # update time, leaving [name, AQI, PM2.5, PM10, time].
            eachsplit.pop(2)
            eachsplit.append(data_time[1])
            values = eachsplit[1:]
            row = []
            for idx in range(4):
                if values[idx] in ('—', ''):
                    row.append(25)  # placeholder for a missing reading
                elif idx == 0:
                    row.append(int(values[idx]))  # AQI: plain integer
                elif idx in (1, 2):
                    row.append(int(values[idx][:-5]))  # strip 5-char unit suffix
                else:
                    row.append(int(values[idx][:-1]))  # strip trailing character
            posiState[eachsplit[0]] = row
    except Exception:
        posiState = crawlpm_other()
    return posiState
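
# Both crawlers return a dict keyed by station name. Judging from how
# downscal() indexes the stored lists (index 1 -> PM2.5, index 2 -> PM10),
# the layout appears to be [AQI, PM2.5, PM10, hour] from crawlpm() and
# [AQI, PM2.5, PM10] from crawlpm_other(), e.g.
# {"莲花": [56, 32, 48, 14], ...}  (illustrative values only).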


def downscal():
    """Interpolate the crawled station readings onto the grid defined in
    libs/cejwdwww10000.csv, using its precomputed per-station weights."""
    pm_data = crawlpm()
    origin_path = os.path.dirname(__file__)
    file_path = os.path.join(origin_path, "libs/cejwdwww10000.csv")
    tudes = pd.read_csv(file_path)
    names = ["观澜", "横岗", "洪湖", "华侨城", "葵涌", "莲花", "龙岗", "梅沙",
             "民治", "南澳", "南海子站", "坪山", "通心岭子站", "西乡", "盐田"]
    output = tudes[["lon", "lat"]]
    # Start fully empty: the first column assignment below then defines the
    # row count (assigning N values to a 0-row frame that already has columns
    # raises a length-mismatch error in pandas).
    output_new = pd.DataFrame()  # filled with lon, lat, pm25, pm10 below
    # Weight columns in the CSV, one per station in `names` order (pinyin
    # initials, e.g. gl = 观澜, nhzz = 南海子站, txl = 通心岭子站).
    headers = ["gl_euc_dis_w", "hg_euc_dis_w", "hh_euc_dis_w", "hqc_euc_dis_w",
               "cy_euc_dis_w", "lh_euc_dis_w", "lg_euc_dis_w", "ms_euc_dis_w",
               "mz_euc_dis_w", "na_euc_dis_w", "nhzz_euc_dis_w", "ps_euc_dis_w",
               "txl_euc_dis_w", "xx_euc_dis_w", "yt_euc_dis_w"]
    # Index of each pollutant in the per-station value list returned by the
    # crawler: index 1 = PM2.5, index 2 = PM10.
    value_index = {"pm2.5": 1, "pm10": 2}
    for scale_type in ["pm2.5", "pm10"]:
        heads = []
        inter_pm = []
        for name in range(len(names)):
            try:
                inter_pm.append(pm_data[names[name]][value_index[scale_type]])
                heads.append(headers[name])
            except (KeyError, IndexError):
                # Station missing from the crawl result: skip it along with
                # its weight column.
                continue
        pm25 = np.array(inter_pm)

        # Weighted average of the station readings at every grid point.
        dis_w_data = tudes[heads].values
        weight_sum = dis_w_data.dot(pm25)
        w_sum = dis_w_data.sum(1)
        pm25_new = weight_sum / w_sum
        output["pm25_new"] = pm25_new
        lons = output["lon"].values.tolist()
        lats = output["lat"].values.tolist()
        pm25s = output["pm25_new"].values.tolist()
        output_new["lon"] = lons
        output_new["lat"] = lats
        if scale_type == "pm2.5":
            output_new["pm25"] = pm25s
        elif scale_type == "pm10":
            output_new["pm10"] = pm25s

    return output_new


# Monitoring-station coordinates as [lon, lat].
POSISTATE = {
    "观澜": [114.06414, 22.716897], "横岗": [114.211036, 22.644685],
    "洪湖": [114.122211, 22.561406], "华侨城": [113.992091, 22.543768],
    "葵涌": [114.429241, 22.634623], "莲花": [114.062445, 22.558013],
    "龙岗": [114.28, 22.715], "梅沙": [114.31842, 22.60319],
    "民治": [114.041814, 22.635083], "南澳": [114.551, 22.511],
    "南海子站": [113.929068, 22.505656], "坪山": [114.351338, 22.693845],
    "通心岭子站": [114.10407, 22.553989], "西乡": [113.857, 22.634],
    "盐田": [114.275, 22.592]
}
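

# The *_euc_dis_w columns in libs/cejwdwww10000.csv look like precomputed
# inverse-distance weights from each grid cell to every station. The CSV
# generation code is not in this file, so this is only a minimal sketch of
# how such weights could be derived from POSISTATE; idw_weights() and its
# `power` parameter are hypothetical, not the code that produced the CSV.
def idw_weights(grid_lon, grid_lat, power=2):
    """Inverse-distance weights from one grid point to every station."""
    weights = {}
    for name, (lon, lat) in POSISTATE.items():
        # Euclidean distance in degrees, a reasonable shortcut at city scale.
        dist = ((grid_lon - lon) ** 2 + (grid_lat - lat) ** 2) ** 0.5
        weights[name] = 1.0 / max(dist, 1e-9) ** power
    return weights
# Example: idw_weights(114.0, 22.6) -> {"观澜": w1, "横岗": w2, ...};
# downscal() averages the station readings with weights of this kind.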

if __name__ == '__main__':
    res = crawlpm()
    print(res)
    print(len(res))
    print(res.keys())
    res2 = crawlpm_other()
    print(res2)
    print(len(res2))
    print(res2.keys())
    # Downscaling demo; requires libs/cejwdwww10000.csv next to this file:
    # grid = downscal()
    # corsp_val = grid[(grid["lon"] == 113.9443959) & (grid["lat"] == 22.5126945)]
    # print(corsp_val)