# stateweath.py
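"""Crawl real-time air-quality readings (AQI, PM2.5, PM10) for Shenzhen
monitoring stations and interpolate them onto a coordinate grid.

crawlpm() scrapes 86pm25.com and falls back to crawlpm_other()
(pm25china.net) on any failure; downscal() spreads the readings over the
grid in libs/cejwdwww10000.csv using its precomputed per-station distance
weights, returning a DataFrame with columns lon, lat, pm25 and pm10.
"""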
import os
import re
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


def crawlpm_other():
    """Fallback crawler: scrape station readings from pm25china.net."""
    url1 = "http://www.pm25china.net/shenzhen/"
    posiState = {}
    response = urllib.request.urlopen(url1)
    soup = BeautifulSoup(response, features='lxml')
    names = ["横岗", "民治", "莲花", "南海子站", "通心岭子站", "荔园", "洪湖",
             "华侨城", "南油", "盐田", "龙岗", "西乡", "南澳", "葵涌", "梅沙", "观澜"]
    # The station table sits in the <div class="weilai"> block; split it into
    # rows and read the numeric cells of each station row.
    shuju = str(soup.findAll("div", {"class": "weilai"})).split("<tr>")
    i = 0
    for eachline in shuju:
        if "href" in eachline:
            eachval = eachline.split("<td>")
            posiState[names[i]] = [int(eachval[2].replace("</td>\n", "")),
                                   int(eachval[4].replace("</td>\n", "")),
                                   int(eachval[5].replace("</td>\n", ""))]
            i += 1
    return posiState


def crawlpm():
    """Primary crawler: scrape station readings from 86pm25.com.

    Falls back to crawlpm_other() if the request or parsing fails.
    """
    try:
        url1 = "http://www.86pm25.com/city/shenzhen.html"
        posiState = {}
        response = urllib.request.urlopen(url1)
        text = response.read().decode("utf-8")
        shuju = re.findall('<tr><td>(.*?)</td></tr>', text, re.S)
        renewTime = re.findall('<div class="remark">(.*?)</div>', text, re.S)
        data_time = renewTime[0][3:].split(" ")
        for each in shuju:
            eachsplit = each.split("</td><td>")
            # Drop the cell at index 2 (apparently non-numeric) and append the
            # update time, leaving [name, AQI, PM2.5, PM10, time].
            eachsplit.pop(2)
            eachsplit.append(data_time[1])
            values = eachsplit[1:]
            row = []
            for idx in range(4):
                if values[idx] in ('—', ''):
                    row.append(25)  # placeholder for a missing reading
                elif idx == 0:
                    row.append(int(values[idx]))  # AQI: plain integer
                elif idx in (1, 2):
                    row.append(int(values[idx][:-5]))  # strip 5-char unit suffix
                else:
                    row.append(int(values[idx][:-1]))  # strip trailing character
            posiState[eachsplit[0]] = row
    except Exception:
        posiState = crawlpm_other()
    return posiState
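
# Both crawlers return a dict keyed by station name. Judging from how
# downscal() indexes the stored lists (index 1 -> PM2.5, index 2 -> PM10),
# the layout appears to be [AQI, PM2.5, PM10, hour] from crawlpm() and
# [AQI, PM2.5, PM10] from crawlpm_other(), e.g.
# {"莲花": [56, 32, 48, 14], ...}  (illustrative values only).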


def downscal():
    """Interpolate the crawled station readings onto the grid defined in
    libs/cejwdwww10000.csv, using its precomputed per-station weights."""
    pm_data = crawlpm()
    origin_path = os.path.dirname(__file__)
    file_path = os.path.join(origin_path, "libs/cejwdwww10000.csv")
    tudes = pd.read_csv(file_path)
    names = ["观澜", "横岗", "洪湖", "华侨城", "葵涌", "莲花", "龙岗", "梅沙",
             "民治", "南澳", "南海子站", "坪山", "通心岭子站", "西乡", "盐田"]
    output = tudes[["lon", "lat"]]
    # Start fully empty: the first column assignment below then defines the
    # row count (assigning N values to a 0-row frame that already has columns
    # raises a length-mismatch error in pandas).
    output_new = pd.DataFrame()  # filled with lon, lat, pm25, pm10 below
    # Weight columns in the CSV, one per station in `names` order (pinyin
    # initials, e.g. gl = 观澜, nhzz = 南海子站, txl = 通心岭子站).
    headers = ["gl_euc_dis_w", "hg_euc_dis_w", "hh_euc_dis_w", "hqc_euc_dis_w",
               "cy_euc_dis_w", "lh_euc_dis_w", "lg_euc_dis_w", "ms_euc_dis_w",
               "mz_euc_dis_w", "na_euc_dis_w", "nhzz_euc_dis_w", "ps_euc_dis_w",
               "txl_euc_dis_w", "xx_euc_dis_w", "yt_euc_dis_w"]
    # Index of each pollutant in the per-station value list returned by the
    # crawler: index 1 = PM2.5, index 2 = PM10.
    value_index = {"pm2.5": 1, "pm10": 2}
    for scale_type in ["pm2.5", "pm10"]:
        heads = []
        inter_pm = []
        for name in range(len(names)):
            try:
                inter_pm.append(pm_data[names[name]][value_index[scale_type]])
                heads.append(headers[name])
            except (KeyError, IndexError):
                # Station missing from the crawl result: skip it along with
                # its weight column.
                continue
        pm25 = np.array(inter_pm)

        # Weighted average of the station readings at every grid point.
        dis_w_data = tudes[heads].values
        weight_sum = dis_w_data.dot(pm25)
        w_sum = dis_w_data.sum(1)
        pm25_new = weight_sum / w_sum
        output["pm25_new"] = pm25_new
        lons = output["lon"].values.tolist()
        lats = output["lat"].values.tolist()
        pm25s = output["pm25_new"].values.tolist()
        output_new["lon"] = lons
        output_new["lat"] = lats
        if scale_type == "pm2.5":
            output_new["pm25"] = pm25s
        elif scale_type == "pm10":
            output_new["pm10"] = pm25s

    return output_new


# Monitoring-station coordinates as [lon, lat].
POSISTATE = {
    "观澜": [114.06414, 22.716897], "横岗": [114.211036, 22.644685],
    "洪湖": [114.122211, 22.561406], "华侨城": [113.992091, 22.543768],
    "葵涌": [114.429241, 22.634623], "莲花": [114.062445, 22.558013],
    "龙岗": [114.28, 22.715], "梅沙": [114.31842, 22.60319],
    "民治": [114.041814, 22.635083], "南澳": [114.551, 22.511],
    "南海子站": [113.929068, 22.505656], "坪山": [114.351338, 22.693845],
    "通心岭子站": [114.10407, 22.553989], "西乡": [113.857, 22.634],
    "盐田": [114.275, 22.592]
}
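

# The *_euc_dis_w columns in libs/cejwdwww10000.csv look like precomputed
# inverse-distance weights from each grid cell to every station. The CSV
# generation code is not in this file, so this is only a minimal sketch of
# how such weights could be derived from POSISTATE; idw_weights() and its
# `power` parameter are hypothetical, not the code that produced the CSV.
def idw_weights(grid_lon, grid_lat, power=2):
    """Inverse-distance weights from one grid point to every station."""
    weights = {}
    for name, (lon, lat) in POSISTATE.items():
        # Euclidean distance in degrees, a reasonable shortcut at city scale.
        dist = ((grid_lon - lon) ** 2 + (grid_lat - lat) ** 2) ** 0.5
        weights[name] = 1.0 / max(dist, 1e-9) ** power
    return weights
# Example: idw_weights(114.0, 22.6) -> {"观澜": w1, "横岗": w2, ...};
# downscal() averages the station readings with weights of this kind.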

if __name__ == '__main__':
    res = crawlpm()
    print(res)
    print(len(res))
    print(res.keys())
    res2 = crawlpm_other()
    print(res2)
    print(len(res2))
    print(res2.keys())
    # Downscaling demo; requires libs/cejwdwww10000.csv next to this file:
    # grid = downscal()
    # corsp_val = grid[(grid["lon"] == 113.9443959) & (grid["lat"] == 22.5126945)]
    # print(corsp_val)