python-探索性数据分析-粮农组织分析
2021/11/30 20:40:14
本文主要是介绍python-探索性数据分析-粮农组织分析,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
python-探索性数据分析-粮农组织分析
消除饥饿、消除贫困、自然资源循环利用:探索性数据分析案例
代码
# Exploratory data analysis of the FAO AQUASTAT data set.
#
# The script loads the compressed CSV, defines a few helpers that slice the
# long-format table (one row per country / variable / time period) into
# different views, and then looks at data completeness and the distribution
# of population figures.

# Common libraries used throughout the analysis.
import gzip
import os
import sys
import warnings

import folium
import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # `import scipy` alone does not guarantee the stats submodule is loaded
import seaborn as sns

warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)

# Load the data file (gzip-compressed CSV).
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
print(data.head())
print(data.shape)
data.info()  # info() must be called; it writes its summary to stdout itself

# List the distinct indicators (short name, full description).
print(data[['variable', 'variable_full']].drop_duplicates())

# How many countries are covered?
print(data.country.nunique())
countries = data.country.unique()

# How many time periods are covered?
print(data.time_period.nunique())
time_periods = data.time_period.unique()
print(time_periods)
mid_periods = range(1960, 2017, 5)

# Is the total-area indicator complete? (number of missing values)
print(data[data.variable == 'total_area'].value.isnull().sum())


# Slice: cross-section -- every country's indicators within one time period.
def time_slice(df, time_period):
    """Return a country x variable table of values for one time period.

    Args:
        df: Long-format frame with ``country``, ``variable``, ``time_period``
            and ``value`` columns.
        time_period: Value of the ``time_period`` column to keep.

    Returns:
        DataFrame indexed by country with one column per variable; the
        columns axis is named after ``time_period``.
    """
    df = df[df.time_period == time_period]
    df = df.pivot(index='country', columns='variable', values='value')
    df.columns.name = time_period
    return df


print(time_slice(data, time_periods[0]).head())


# Slice: one country -- each indicator across all time periods.
def country_slice(df, country):
    """Return a variable x time_period table of values for one country.

    Args:
        df: Long-format frame with ``country``, ``variable``, ``time_period``
            and ``value`` columns.
        country: Value of the ``country`` column to keep.

    Returns:
        DataFrame indexed by variable with one column per time period; the
        index is named after ``country``.
    """
    df = df[df.country == country]
    df = df.pivot(index='variable', columns='time_period', values='value')
    df.index.name = country
    return df


print(country_slice(data, countries[40]).head())


# Slice: panel data -- one indicator for all countries over time.
def variable_slice(df, variable):
    """Return a country x time_period table of values for one variable.

    Args:
        df: Long-format frame with ``country``, ``variable``, ``time_period``
            and ``value`` columns.
        variable: Value of the ``variable`` column to keep.

    Returns:
        DataFrame indexed by country with one column per time period.
    """
    df = df[df.variable == variable]
    df = df.pivot(index='country', columns='time_period', values='value')
    return df


print(variable_slice(data, 'total_pop').head())


# Slice: time series -- one indicator for one country, by measurement year.
def time_series(df, country, variable):
    """Return the yearly series of one variable for one country.

    Rows containing any missing value are dropped before the series is built.

    Args:
        df: Long-format frame with ``country``, ``variable``,
            ``year_measured`` and ``value`` columns.
        country: Value of the ``country`` column to keep.
        variable: Value of the ``variable`` column to keep.

    Returns:
        Single-column DataFrame named ``variable``, indexed by the integer
        ``year_measured``.
    """
    selected = df[(df.country == country) & (df.variable == variable)]
    series = (
        selected.dropna()[['year_measured', 'value']]
        # assign() builds a new frame instead of writing into a slice.
        .assign(year_measured=lambda frame: frame.year_measured.astype(int))
        .set_index('year_measured')
    )
    series.columns = [variable]
    return series


print(time_series(data, 'Belarus', 'total_pop'))
print(data.region.unique())

# The regions are very fine-grained; the mapping below would merge them into
# coarser regions. It is kept disabled, as in the original script.
# simple_regions = {
#     'World | Asia': 'Asia',
#     'Americas | Central America and Caribbean | Central America': 'North America',
#     'Americas | Central America and Caribbean | Greater Antilles': 'North America',
#     'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
#     'Americas | Northern America | Northern America': 'North America',
#     'Americas | Northern America | Mexico': 'North America',
#     'Americas | Southern America | Guyana': 'South America',
#     'Americas | Southern America | Andean': 'South America',
#     'Americas | Southern America | Brazil': 'South America',
#     'Americas | Southern America | Southern America': 'South America',
#     'World | Africa': 'Africa',
#     'World | Europe': 'Europe',
#     'World | Oceania': 'Oceania',
# }
#
# data.region = data.region.apply(lambda x: simple_regions[x])
# print(data.region.unique())


# Slice: geospatial -- all countries that belong to the same region.
def subregion(data, region):
    """Return the rows of ``data`` whose ``region`` column equals ``region``."""
    return data[data.region == region]


# Data-quality assessment.
recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)

# Total exploitable water resources.
# NOTE: the ``inline`` keyword was dropped here; it was removed in
# missingno 0.5 and ``False`` was its default in earlier releases.
msno.matrix(variable_slice(data, 'exploitable_total'), sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing total exploitable water resources data cross counties and time jperiods \n \n \n \n')
plt.show()

# Drop the 'exploitab*' indicators, which are mostly missing.
data = data.loc[~data.variable.str.contains('exploitab'), :]

# Missingness of the national rainfall index.
msno.matrix(variable_slice(data, 'national_rainfall_index'), sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing national rainfall index data across coutries and time periods \n \n \n')
plt.show()
print('************************************')


# Show on a world map which countries report a given indicator.
def plot_null_map(df, time_period, variable, legend_name=None):
    """Build a choropleth map flagging where ``variable`` is reported.

    Each country is coloured by 1 when ``variable`` has a value in
    ``time_period`` and 0 when it is missing.

    Args:
        df: Long-format frame accepted by ``time_slice``.
        time_period: Time period to inspect.
        variable: Indicator column to check for missing values.
        legend_name: Legend caption; falls back to ``variable`` when empty.

    Returns:
        The ``folium.Map`` with the choropleth layer added.
    """
    geo = r'world.json'
    ts = time_slice(df, time_period).reset_index().copy()
    ts[variable] = ts[variable].notnull() * 1
    # Named world_map so the builtin ``map`` is not shadowed.
    world_map = folium.Map(location=[48, -102], zoom_start=2)
    folium.Choropleth(
        geo_data=geo,
        data=ts,
        columns=['country', variable],
        # ``key_on`` (not ``key_no``) names the GeoJSON property to join on.
        key_on='feature.properties.name',
        fill_color='GnBu',
        fill_opacity=1,
        line_opacity=0.2,
        legend_name=legend_name if legend_name else variable,
    ).add_to(world_map)
    return world_map


# The same map for another indicator can be produced with, e.g.:
# plot_null_map(data, '2013-2017', 'agg_to_gdp',
#               'Missing agricultural contribution to GDP data 2013-2017')
save_map = plot_null_map(data, '2013-2017', 'number_undernourished',
                         'Number undernourished is missing')
save_map.save('save_map.html')

# Per time period and indicator: how many countries reported a value,
# i.e. whether each variable was collected in each period.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(data.groupby(['time_period', 'variable']).value.count().unstack().T, ax=ax)
plt.xticks(rotation=45)
plt.xlabel('Time period')
plt.ylabel('Variable')
plt.title('Number of countries with data reportes')
plt.show()

# recent[['total_pop','urban_pop','rural_pop']].describe().astype(int)

# Sort by rural population.
recent_sort = recent.sort_values('rural_pop')[['total_pop', 'urban_pop', 'rural_pop']].head()
print(recent_sort)

# Skewness and kurtosis of the population indicators.
print(recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew))
print(recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis))

# Distribution of the current data.
fig, ax = plt.subplots(figsize=(12, 8))
# Missing values are dropped because hist() cannot bin NaN.
ax.hist(recent.total_pop.dropna().values, bins=50)
ax.set_xlabel('Total population')
ax.set_ylabel('Number of contries')
ax.set_title('Distrbution of population of countries 2013-2017')
plt.show()

# In theory, when a distribution's standard deviation grows linearly with
# its mean, a log transform is appropriate; check the skew after the log.
recent_log = recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)
print(recent_log)

# Summary: a worked learning example.
这篇关于python-探索性数据分析-粮农组织分析的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2025-01-03用FastAPI掌握Python异步IO:轻松实现高并发网络请求处理
- 2025-01-02封装学习:Python面向对象编程基础教程
- 2024-12-28Python编程基础教程
- 2024-12-27Python编程入门指南
- 2024-12-27Python编程基础
- 2024-12-27Python编程基础教程
- 2024-12-27Python编程基础指南
- 2024-12-24Python编程入门指南
- 2024-12-24Python编程基础入门
- 2024-12-24Python编程基础:变量与数据类型