#coding=utf-8
import csv,sys
from collections import Counter
import math as mh
import pprint as pp
import numpy as np
import pandas as pd
import datetime as dt
import json
import seaborn as sns
import matplotlib
matplotlib.use('qt4agg')  # select the backend before pyplot is imported, otherwise the call has no effect
import matplotlib.pyplot as plt  # pyplot for quick plotting
import pylab as pl
sys.path.append(r'D:\bigdatahw\python\hw3')  # add the project directory to the module search path (raw string avoids '\b' being read as backspace)
print sys.path  # confirm the path was added
# set a default font that can render Chinese characters in the plots
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
# keep the minus sign '-' from being rendered as a box
matplotlib.rcParams['axes.unicode_minus'] = False
mydata=pd.read_csv('D:/bigdatahw/python/hw3/CreditCard.csv',header=0)
mydata
mydata.head(5)
print mydata.isnull().any()  # check each column for missing values; only time and code are complete
mydata[mydata.isnull().values==True]  # locate the rows that contain missing values
mydata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)  # drop records with any missing value
age=mydata.iloc[:,1]
age.mean(axis=0)
age=mydata['age']
mydata.describe()
# summary statistics per customer category
count = 0  # total number of default records
sum_age = 0
avg_age = 0  # average age over all default records
dict_count = {}  # number of records per customer category
dict_sum_age = {}  # summed age per customer category
for i in range(len(mydata)):
    count = count + 1
    sum_age = sum_age + float(mydata.iloc[i, 1])
    dict_count[mydata.iloc[i, 6]] = dict_count.setdefault(mydata.iloc[i, 6], 0) + 1
    dict_sum_age[mydata.iloc[i, 6]] = dict_sum_age.setdefault(mydata.iloc[i, 6], 0) + float(mydata.iloc[i, 1])  # accumulate ages by category key
avg_age=float(sum_age)/(count)
print 'Total number of default records: %d' % count
print 'Average age of default records: %f' % avg_age
dict_avg_age={}
for key in dict_count:
    dict_avg_age[key] = round(dict_sum_age[key] / dict_count[key], 2)
dict_count = json.dumps(dict_count, ensure_ascii=False)  # dumps converts the dict to a str so Chinese keys print correctly
dict_avg_age = json.dumps(dict_avg_age, ensure_ascii=False)  # dumps converts the dict to a str so Chinese keys print correctly
print dict_count
print dict_avg_age
dict_count = eval(dict_count)  # eval converts the str back into a dict for further computation
dict_avg_age = eval(dict_avg_age)  # eval converts the str back into a dict for further computation
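# Cross-check of the loop above with a pandas groupby (a sketch; it assumes column 6 of
# mydata is the customer-category code used as the dict key above):
code_col = mydata.columns[6]
print mydata.groupby(code_col)['age'].agg(['count', 'mean']).round(2)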
age = mydata['age']  # copy the age column to draw a histogram
print age[age>100]  # look for outliers
age[age>100] = 100  # cap outlier ages at 100
print age.describe()
plt.title('Age Distribution') # give plot a title
plt.hist(age,alpha=0.7)
plt.xlabel('Customer Age')
plt.ylabel('count')
plt.show()
sns.set(style="darkgrid", palette="muted", color_codes=True)  # set() configures the theme and colour palette
fig, axes = plt.subplots(1, 3, figsize=(12, 4))  # create one Figure and return a numpy array of the created subplots
sns.distplot(age, ax=axes[0], bins=15, kde=False)  # histogram of the age distribution
plt.title('Customer age histogram')
sns.kdeplot(age, ax=axes[1], shade=True)  # kernel density curve of the age distribution
plt.title('Customer age density curve')
pl.xlabel('age')
# box plot of customer age for the whole sample
plt.boxplot(x="age",
            data=mydata,  # data source
            patch_artist=True,  # fill the box with a custom colour (default is white)
            showmeans=True,  # mark the mean with a point
            boxprops={'color': 'black', 'facecolor': '#9999ff'},  # box fill and border colours
            flierprops={'marker': 'o', 'markerfacecolor': 'red', 'color': 'black'},  # outlier marker shape and colours
            meanprops={'marker': 'D', 'markerfacecolor': 'indianred'},  # mean marker shape and fill colour
            medianprops={'linestyle': '--', 'color': 'orange'})  # median line style and colour
# set the y-axis range
plt.ylim(0, 85)
# hide the tick marks on the top and right edges of the plot
plt.tick_params(top='off', right='off')
plt.title('Customer age box plot')
pl.xlabel('age')
plt.subplots_adjust(wspace=0.8)
plt.show()
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sns.set(style="darkgrid", palette="muted", color_codes=True)
# box plot of age by customer category
sns.boxplot(x = "code",y ="age",data=mydata, order=["顾客1", "顾客2","顾客3","顾客4","顾客5","顾客6","顾客7","顾客8","顾客9","顾客10","顾客11","顾客12"])
pl.subplots_adjust(left=0.08, right=0.95, wspace=0.25, hspace=0.45)
plt.title('Customer age by customer category')
plt.ylim(-10.0, 110.0)
plt.show()
dict_occu_count = {}  # number of records per occupation
for i in range(len(mydata)):
    dict_occu_count[mydata.iloc[i, 3]] = dict_occu_count.setdefault(mydata.iloc[i, 3], 0) + 1
dict_occu_perc = {}
for key in dict_occu_count:
    dict_occu_perc[key] = round((float(dict_occu_count[key]) / len(age)) * 100, 3)
perc_list = list(dict_occu_perc.values())
dict_occu_perc = json.dumps(dict_occu_perc, ensure_ascii=False)  # convert to str so Chinese keys print correctly
dict_occu_count = json.dumps(dict_occu_count, ensure_ascii=False)  # convert to str so Chinese keys print correctly
print dict_occu_perc
print dict_occu_count
dict_occu_perc = eval(dict_occu_perc)  # convert the str back into a dict for further computation
dict_occu_count = eval(dict_occu_count)  # convert the str back into a dict for further computation
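# Quick cross-check of the occupation shares above (a sketch; it assumes the 'occupation'
# column is the field counted in dict_occu_count):
print (mydata['occupation'].value_counts(normalize=True) * 100).round(3)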
labels = ['Agricultural','Unemployed', 'Other','technology','Retirement','Work','Housework','transportation','student','business','children','soldier']
fracs =perc_list
explode = [0.1,0.1,0.1,0.1,0,0,0,0,0,0,0,0.1]  # wedges with 0.1 are pulled out from the centre by 0.1 of the radius
plt.axes(aspect=1) # set this , Figure is round, otherwise it is an ellipse
# autopct controls the percentage labels shown on the wedges
plt.pie(x=fracs, labels=labels, explode=explode,autopct='%3.1f %%',colors = ["blue","red","coral","green","yellow","orange"],
shadow=True, labeldistance=1.1, startangle = 90,pctdistance = 0.6)
plt.title('Occupation distribution pie chart')
'''
labeldistance: distance of the wedge labels from the centre; 1.1 means 1.1 times the radius
autopct: format of the text inside the pie; '%3.1f%%' prints a float with one decimal place
shadow: whether the pie casts a shadow
startangle: starting angle; 0 starts the first wedge at 0 degrees and proceeds counter-clockwise, starting at 90 usually looks better
pctdistance: distance of the percentage text from the centre
patches, l_texts, p_texts: the return values of pie(); p_texts are the texts inside the pie, l_texts the labels outside it
'''
plt.show()
sns.countplot(x = mydata['degree'],hue = mydata.iloc[:,0])
plt.title('Default severity by gender')
plt.show()
sns.countplot(x = mydata['education'],hue = mydata.iloc[:,0])
plt.title('Default severity by education level')
plt.show()
mydata['sex']=mydata.iloc[:,0]  # add an explicit sex column (copy of column 0)
mydata
g = sns.FacetGrid(mydata, col= "degree", row="sex",margin_titles=True)
g = g.map(plt.hist, "age", color="r")
plt.show()
now = dt.datetime.now()  # current timestamp
type(mydata['time'])
mydata['nowtime']=dt.datetime(2017,11,1)  # add the reference date used for interval calculations
mydata
def fun1(x):
    # number of days between the reference date and the default date
    d1 = mydata['nowtime'][1]
    y = x.split()[0]
    y = y.replace('/', '-')
    d2 = dt.datetime.strptime(y, '%Y-%m-%d')
    d = d1 - d2
    return int(d.days)
mydata['interval'] = [fun1(time) for time in mydata['time']]  # days elapsed since each default
mydata
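# A vectorised cross-check of the interval computation above (a sketch only; it assumes
# the 'time' strings parse with pandas' default date parsing and is not used further):
interval_check = (mydata['nowtime'] - pd.to_datetime(mydata['time']).dt.normalize()).dt.days
print (interval_check == mydata['interval']).all()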
end = dt.datetime(2017,11,1)
start = dt.datetime(2016,1,1)
Duration = (end-start).days
mydata[mydata['interval']<0]  # negative intervals mean the default date lies after the reference date, i.e. erroneous records
mydata[mydata['interval']<0]=np.nan
mydata[mydata.isnull().values==True]  # locate the rows that were set to NaN
mydata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)  # drop records that contain missing values
mydata[mydata['interval']<0]  # confirm that no negative intervals remain
mydata[mydata['interval']>670]  # intervals above 670 days are stale records from before 2016-01-01
mydata[mydata['interval']>670]=np.nan
mydata[mydata.isnull().values==True]  # locate the rows that were set to NaN
mydata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)  # drop records that contain missing values
mydata[mydata['interval']>670]  # confirm that no stale records remain
mydata[mydata['age']<18]  # records with an age below 18
mydata[mydata['age']<18]=np.nan
mydata[mydata.isnull().values==True]  # locate the rows that were set to NaN
mydata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)  # drop records that contain missing values
mydata[mydata['age']<18]  # confirm that no under-18 records remain
len(mydata)
mydata[mydata['occupation']=='学龄前儿童']  # records whose occupation is '学龄前儿童' (preschool child)
mydata[mydata['occupation']=='学龄前儿童']=np.nan
mydata[mydata.isnull().values==True]  # locate the rows that were set to NaN
mydata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)  # drop records that contain missing values
mydata[mydata['occupation']=='学龄前儿童']  # confirm that no preschool-child records remain
len(mydata)
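# The repeated pattern above (set invalid rows to NaN, then dropna) is equivalent to one
# boolean filter. A minimal sketch with the same validity rules, kept in a separate
# variable so the original pipeline is unchanged:
mydata_clean = mydata[(mydata['interval'] >= 0) & (mydata['interval'] <= 670) &
                      (mydata['age'] >= 18) & (mydata['occupation'] != '学龄前儿童')]
len(mydata_clean)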
mydata[mydata['age']>150]=np.nan  # mark mis-entered records with age above 150
mydata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)  # drop records that contain missing values
mydata[mydata['age']>100]  # records with age above 100 will be capped at 100
def fun2(x):
    # cap ages above 100 at 100
    if x > 100:
        y = 100
    else:
        y = x
    return y
mydata['age'] = [fun2(age) for age in mydata['age']]
mydata[mydata['age']==100]  # inspect the rows that were capped
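# The same capping can be expressed with pandas' clip; a minimal sketch kept in a
# separate variable so the pipeline above is unchanged:
age_capped = mydata['age'].clip(upper=100)
print (age_capped == mydata['age']).all()  # True, since ages were already capped by fun2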
# feature screening with information entropy
"""
calculate shannon ent of x
"""
def calc_ent(x):
    x_value_list = set(x)
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
calc_ent(mydata['degree'])  # entropy of the default-severity variable
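# Quick sanity check of calc_ent (illustrative only, not part of the analysis):
# a balanced two-valued variable has an entropy of exactly 1 bit.
print calc_ent(np.array(['a', 'b', 'a', 'b']))  # expected 1.0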
# conditional entropy
"""
calculate ent H(y|x)
"""
def calc_condition_ent(x, y):
    # calc ent(y|x)
    x_value_list = set(x)
    ent = 0.0
    for x_value in x_value_list:
        sub_y = y[x == x_value]
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent
calc_condition_ent(mydata['sex'], mydata['degree'])
calc_condition_ent(mydata['education'], mydata['degree'])
calc_condition_ent(mydata['occupation'], mydata['degree'])
# information gain
"""
calculate ent grap
"""
def calc_ent_grap(x,y):
    base_ent = calc_ent(y)
    condition_ent = calc_condition_ent(x, y)
    ent_grap = base_ent - condition_ent
    return ent_grap
calc_ent_grap(mydata['sex'],mydata['degree'])
calc_ent_grap(mydata['education'],mydata['degree'])
calc_ent_grap(mydata['occupation'],mydata['degree'])
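# Ranking the three candidate features by their information gain on default severity,
# as a compact summary of the three calls above (a sketch using column names already
# used in this script):
gains = {col: calc_ent_grap(mydata[col], mydata['degree']) for col in ['sex', 'education', 'occupation']}
print sorted(gains.items(), key=lambda kv: kv[1], reverse=True)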
# information gain ratio
"""
calculate ent ratio
"""
def calc_ent_ratio(x,y):
    # information gain of x on y divided by the entropy of x itself
    ent = 0.0
    x_value_list = set(x)
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    ent_ratio = calc_ent_grap(x,y) / ent
    return ent_ratio
calc_ent_ratio(mydata['occupation'],mydata['degree'])
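# Gain ratios of the other two candidate features, mirroring the information-gain calls above:
calc_ent_ratio(mydata['sex'], mydata['degree'])
calc_ent_ratio(mydata['education'], mydata['degree'])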
# discretise the continuous age variable by information gain, for the scoring step
age_value_list = set(mydata['age'])
age_value_list = sorted(age_value_list)  # candidate split points need the values in order
point = []
for i in range(len(age_value_list)-1):
    point.append((age_value_list[i] + age_value_list[i+1]) / 2.0)
def lisan(x, y):
    # scan the candidate split points and keep the one with the largest information gain
    best_point = 0
    best_gain = 0.0
    for i in range(len(point)):
        x_binned = (x > point[i]).astype(int)  # binarise age at the candidate split
        gain = calc_ent_grap(x_binned, y)
        if gain > best_gain:
            best_gain = gain
            best_point = point[i]
    return best_point, best_gain
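# Illustrative call (not in the original script): find the age split that best
# separates the default-severity classes.
best_split, best_gain = lisan(mydata['age'], mydata['degree'])
print best_split, best_gain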
mydata['count']=1
o_group=mydata['count'].groupby([mydata['occupation']])
outter=o_group.sum().to_frame()
outter=outter.reset_index()
occu_count=outter['count']
# group-by queries on the cleaned data
grouped=mydata['age'].groupby([mydata['degree'],mydata['occupation']])
s_group=mydata['count'].groupby([mydata['occupation'],mydata['degree']])
out=s_group.sum().to_frame()
out=out.reset_index()
out['ratio'] = 0.0  # share of each severity level within its occupation (filled below)
a = 0
for i in range(len(out)):
    out.iloc[i, 3] = float(out.iloc[i, 2]) / occu_count[a]
    sq = (i + 1) % 3
    if sq == 0:  # three severity levels per occupation, so advance to the next occupation every third row
        a = a + 1
out['weight'] = [5, 2, 10] * 11  # per-severity weights, repeated for each of the 11 occupation groups
out['score'] = out['weight'] * out['ratio']  # weighted share of each (occupation, severity) pair
outter['score'] = 0.0
b = 0
for i in range(len(out)):
    outter.iloc[b, 2] += out.iloc[i, 5]  # accumulate the three weighted shares into the occupation's total score
    sq = (i + 1) % 3
    if sq == 0:  # three severity rows per occupation
        b = b + 1
out
outter
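# Cross-check of the per-occupation shares computed with the manual row counters above
# (a sketch; it assumes the 'occupation' and 'count' columns of out built earlier):
check = out.copy()
grp_total = check.groupby('occupation')['count'].transform('sum')
check['ratio2'] = check['count'].astype(float) / grp_total
print (abs(check['ratio2'] - check['ratio']) < 1e-9).all()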
mydata[(mydata['age']>50) & (mydata['age']<60)]
# risk quantification: loss = severity weight * exp(-interval / 670), so older defaults count for less
risk = {'轻度': 2, '中度': 5, '重度': 10}
mydata['loss'] = float(0)
loss_col = mydata.columns.get_loc('loss')
i = 0
for _, customer in mydata.iterrows():
    inv = customer['interval']       # days elapsed since the default
    rk = risk[customer['degree']]    # severity weight
    weak = mh.exp(-1.0 * inv / 670)  # exponential decay over the 670-day observation window
    mydata.iloc[i, loss_col] = rk * weak
    i = i + 1
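# Worked example of the decay (illustrative numbers only): a '重度' default 335 days old
# contributes 10 * exp(-335/670) = 10 * exp(-0.5), about 6.07, while one 670 days old
# contributes 10 * exp(-1), about 3.68.
print 10 * mh.exp(-335.0 / 670), 10 * mh.exp(-670.0 / 670)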
mydata
# average risk per customer category
sumrisk = {}
sumfreq = {}
for i in range(len(mydata)):
    sumfreq[mydata.iloc[i, 6]] = sumfreq.setdefault(mydata.iloc[i, 6], 0) + 1
    sumrisk[mydata.iloc[i, 6]] = sumrisk.setdefault(mydata.iloc[i, 6], 0) + float(mydata['loss'].iloc[i])  # accumulate loss by category key
avg_risk = {}
for key in sumfreq:
    avg_risk[key] = round(sumrisk[key] / sumfreq[key], 2)
avg_risk = json.dumps(avg_risk, ensure_ascii=False)  # convert to str so Chinese keys print correctly
avg_risk