"""Scrape Shenzhen air-quality station readings and spread them over a grid.

``crawlpm`` fetches per-station readings (with ``crawlpm_other`` as a
fallback source) and ``downscal`` combines them with per-grid-point station
weights read from ``libs/cejwdwww10000.csv``.
"""
import logging
import os
import re
import urllib.request
from typing import Dict, List

import numpy as np
import pandas as pd

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Only the fallback scraper (crawlpm_other) needs bs4, so the module
    # stays importable without it; crawlpm_other raises when it is missing.
    BeautifulSoup = None

logger = logging.getLogger(__name__)

_PRIMARY_URL = "http://www.86pm25.com/city/shenzhen.html"
_FALLBACK_URL = "http://www.pm25china.net/shenzhen/"
_REQUEST_TIMEOUT_SECONDS = 30

# Station names assigned, in page order, to the data rows of the fallback
# page. NOTE(review): the mapping is purely positional - confirm it still
# matches the row order of the live page.
_FALLBACK_STATION_ORDER = (
    "横岗", "民治", "莲花", "南海子站", "通心岭子站", "荔园", "洪湖", "华侨城",
    "南油", "盐田", "龙岗", "西乡", "南澳", "葵涌", "梅沙", "观澜",
)

# Cell texts the primary page uses for "no value".
_PLACEHOLDERS = ("—", "")
# Value substituted for a placeholder cell.
_MISSING_READING_DEFAULT = 25
# Number of trailing characters removed from each of the four kept cells
# before int(): nothing for the first, five for the next two, one for the
# appended time field. Presumably a "μg/m³" unit and a one-character hour
# marker - TODO confirm against the live page.
_UNIT_SUFFIX_LENGTHS = (0, 5, 5, 1)

# (station name, weight column in the grid CSV) pairs used by downscal.
_STATION_WEIGHT_COLUMNS = (
    ("观澜", "gl_euc_dis_w"),
    ("横岗", "hg_euc_dis_w"),
    ("洪湖", "hh_euc_dis_w"),
    ("华侨城", "hqc_euc_dis_w"),
    ("葵涌", "cy_euc_dis_w"),
    ("莲花", "lh_euc_dis_w"),
    ("龙岗", "lg_euc_dis_w"),
    ("梅沙", "ms_euc_dis_w"),
    ("民治", "mz_euc_dis_w"),
    ("南澳", "na_euc_dis_w"),
    ("南海子站", "nhzz_euc_dis_w"),
    ("坪山", "ps_euc_dis_w"),
    ("通心岭子站", "txl_euc_dis_w"),
    ("西乡", "xx_euc_dis_w"),
    ("盐田", "yt_euc_dis_w"),
)


def parse_pm25china(markup: str) -> Dict[str, List[int]]:
    """Extract station readings from the fallback page's table markup.

    Args:
        markup: text containing the ``<tr>`` rows of the station table.

    Returns:
        Mapping of station name to three integers taken from the 2nd, 4th
        and 5th ``<td>`` split fields of each row that contains a link.
        Names come from ``_FALLBACK_STATION_ORDER`` by position; rows beyond
        the known names are ignored.

    Raises:
        IndexError, ValueError: if a data row has too few cells or a
            non-integer value.
    """
    data_rows = [row for row in markup.split("<tr>") if "href" in row]
    readings = {}
    for station, row in zip(_FALLBACK_STATION_ORDER, data_rows):
        cells = row.split("<td>")
        readings[station] = [
            int(cells[position].replace("</td>\n", ""))
            for position in (2, 4, 5)
        ]
    return readings


def crawlpm_other() -> Dict[str, List[int]]:
    """Fetch station readings from the fallback site (pm25china.net).

    Returns:
        Mapping of station name to a three-integer list, see
        ``parse_pm25china``.

    Raises:
        ImportError: if ``bs4`` is not installed.
        urllib.error.URLError: if the page cannot be fetched.
    """
    if BeautifulSoup is None:
        raise ImportError(
            "crawlpm_other requires the 'bs4' (beautifulsoup4) package"
        )
    with urllib.request.urlopen(
        _FALLBACK_URL, timeout=_REQUEST_TIMEOUT_SECONDS
    ) as response:
        soup = BeautifulSoup(response.read(), features='lxml')
    tables = soup.find_all("div", {"class": "weilai"})
    return parse_pm25china(str(tables))


def _cell_to_int(raw: str, suffix_length: int) -> int:
    """Convert one table cell to int, dropping ``suffix_length`` trailing chars."""
    if raw in _PLACEHOLDERS:
        return _MISSING_READING_DEFAULT
    # raw[:-0] would be empty, so only slice when there is a suffix to drop.
    return int(raw[:-suffix_length] if suffix_length else raw)


def parse_86pm25(text: str) -> Dict[str, List[int]]:
    """Extract station readings from the primary page's HTML.

    Each ``<tr><td>...</td></tr>`` row is split into cells; the first cell
    is the station name, the third cell is discarded, and the second
    space-separated field of the page's ``remark`` div (after its first
    three characters) is appended as a final cell.

    Args:
        text: decoded HTML of the page.

    Returns:
        Mapping of station name to four integers. Placeholder cells
        (``—`` or empty) become ``_MISSING_READING_DEFAULT``. An empty dict
        is returned when the page has a remark but no data rows.

    Raises:
        IndexError, ValueError: if the remark div is missing or a row does
            not have the expected shape.
    """
    rows = re.findall('<tr><td>(.*?)</td></tr>', text, re.S)
    remarks = re.findall("""<div class="remark">(.*?)</div>""", text, re.S)
    time_field = remarks[0][3:].split(" ")[1]

    readings = {}
    for row in rows:
        cells = row.split("</td><td>")
        cells.pop(2)
        cells.append(time_field)
        values = cells[1:]
        readings[cells[0]] = [
            _cell_to_int(values[position], suffix_length)
            for position, suffix_length in enumerate(_UNIT_SUFFIX_LENGTHS)
        ]
    return readings


def crawlpm() -> Dict[str, List[int]]:
    """Fetch station readings, preferring 86pm25.com over the fallback site.

    Any failure while fetching or parsing the primary page (including a
    page that yields no stations) is logged and answered with
    ``crawlpm_other()``; errors raised by the fallback propagate.

    Returns:
        Mapping of station name to a list of integers: four values from the
        primary source, three from the fallback. ``downscal`` reads index 1
        as PM2.5 and index 2 as PM10.
    """
    try:
        with urllib.request.urlopen(
            _PRIMARY_URL, timeout=_REQUEST_TIMEOUT_SECONDS
        ) as response:
            text = response.read().decode("utf-8")
        readings = parse_86pm25(text)
        if not readings:
            raise ValueError("no station rows found on the primary page")
    except Exception:
        # Deliberately broad: every primary-source failure means "use the
        # fallback", but record why instead of discarding the error.
        logger.warning(
            "Primary PM source failed; using fallback", exc_info=True
        )
        readings = crawlpm_other()
    return readings


def interpolate_grid(
    grid: pd.DataFrame, pm_data: Dict[str, List[int]]
) -> pd.DataFrame:
    """Compute weighted-average PM2.5 and PM10 for every grid row.

    For each pollutant, the stations present in ``pm_data`` are combined as
    ``sum(weight * reading) / sum(weight)`` per row, using the weight
    columns listed in ``_STATION_WEIGHT_COLUMNS``.

    Args:
        grid: frame with ``lon``, ``lat`` and one weight column per station
            that appears in ``pm_data``.
        pm_data: station name -> readings list (index 1 = PM2.5,
            index 2 = PM10).

    Returns:
        New frame with columns ``lon``, ``lat``, ``pm25``, ``pm10``. A
        pollutant column is all-NaN when no known station supplies it.
    """
    result = pd.DataFrame({
        "lon": grid["lon"].to_numpy(),
        "lat": grid["lat"].to_numpy(),
    })
    for column, value_index in (("pm25", 1), ("pm10", 2)):
        weight_columns = []
        station_values = []
        for station, weight_column in _STATION_WEIGHT_COLUMNS:
            try:
                value = pm_data[station][value_index]
            except (KeyError, IndexError):
                # Station absent from this scrape: leave it out of the mean.
                continue
            station_values.append(value)
            weight_columns.append(weight_column)

        if not station_values:
            result[column] = np.nan
            continue

        weights = grid[weight_columns].to_numpy()
        weighted_sum = weights.dot(np.array(station_values))
        result[column] = weighted_sum / weights.sum(axis=1)
    return result


def downscal() -> pd.DataFrame:
    """Scrape current readings and interpolate them onto the stored grid.

    Reads ``libs/cejwdwww10000.csv`` next to this file and returns the
    frame produced by ``interpolate_grid``.
    """
    pm_data = crawlpm()
    grid_path = os.path.join(
        os.path.dirname(__file__), "libs/cejwdwww10000.csv"
    )
    grid = pd.read_csv(grid_path)
    return interpolate_grid(grid, pm_data)


# Station name -> two coordinates; presumably [longitude, latitude] given the
# value ranges - TODO confirm. Not referenced elsewhere in this file.
POSISTATE = {
    "观澜": [114.06414, 22.716897],
    "横岗": [114.211036, 22.644685],
    "洪湖": [114.122211, 22.561406],
    "华侨城": [113.992091, 22.543768],
    "葵涌": [114.429241, 22.634623],
    "莲花": [114.062445, 22.558013],
    "龙岗": [114.28, 22.715],
    "梅沙": [114.31842, 22.60319],
    "民治": [114.041814, 22.635083],
    "南澳": [114.551, 22.511],
    "南海子站": [113.929068, 22.505656],
    "坪山": [114.351338, 22.693845],
    "通心岭子站": [114.10407, 22.553989],
    "西乡": [113.857, 22.634],
    "盐田": [114.275, 22.592]
}

if __name__ == '__main__':
    # Smoke-run both scrapers (the second call used to reference an
    # undefined crawlpm1).
    for scrape in (crawlpm, crawlpm_other):
        res = scrape()
        print(res)
        print(len(res))
        print(res.keys())