def get_debt_rank(self):
    """Count low-debt and high-debt customers in each age bracket.

    Reads ``df``, a DataFrame defined outside this block (the calls used
    here look like the PySpark DataFrame API -- confirm against the rest of
    the file).  Ages are split into five brackets: 0-29, 30-44, 45-59,
    60-74 and 75-100.  For every bracket two counts are taken: rows with
    ``DebtRatio < 1`` and rows with ``DebtRatio >= 1`` (labelled
    "not severe" / "severe" in the original docstring).

    :return: a one-element list wrapping the per-bracket list of
        ``[low_debt_count, high_debt_count]`` pairs, youngest bracket first.
    """
    # Renamed from ``bin`` so the builtin of the same name is not shadowed.
    age_edges = [0, 30, 45, 60, 75, 100]
    df_age_debt = df.select(df['age'], df['DebtRatio'])
    last_bracket = len(age_edges) - 2

    age_debt_y = []
    for idx in range(len(age_edges) - 1):
        lower, upper = age_edges[idx], age_edges[idx + 1]
        # Column.between() is inclusive on both ends; using it for every
        # bracket counted the boundary ages (30, 45, 60, 75) twice.  Use
        # half-open ranges and close only the final bracket.
        if idx == last_bracket:
            in_bracket = (df['age'] >= lower) & (df['age'] <= upper)
        else:
            in_bracket = (df['age'] >= lower) & (df['age'] < upper)
        bracket_rows = df_age_debt.filter(in_bracket)

        y0 = bracket_rows.filter(df['DebtRatio'] < 1).count()
        print(y0)
        y1 = bracket_rows.filter(df['DebtRatio'] >= 1).count()
        print(y1)
        age_debt_y.append([y0, y1])

    # The extra level of nesting is part of the existing return shape; the
    # original comment says the data is visualised by data_web.py.
    return [age_debt_y]
统计各年龄段的债务情况
def XiangGuanXing(self):
    """Build heat-map data of pairwise Pearson correlations between features.

    ("XiangGuanXing" is pinyin for "correlation".)  Reads ``df``, a DataFrame
    defined outside this block.  Every feature column is cast to float and
    correlated with every other feature column.

    :return: a list of ``[row_index, col_index, correlation]`` triples in
        row-major order over the feature list; diagonal entries carry the
        literal ``1`` instead of a computed value.
    """
    feature = ['y', 'age', '30-59days', 'DebtRatio', 'MonthlyIncome',
               'num_late_card', '60-89days', 'RealEstateLoans', 'families']

    # Pearson correlation is symmetric, so each unordered pair is computed
    # once and reused for its mirror cell.  This halves the number of
    # correlation calls compared with evaluating (i, j) and (j, i) separately.
    pair_corr = {}
    heat_map_data_arr = []
    for i, row_name in enumerate(feature):
        for j, col_name in enumerate(feature):
            if i == j:
                heat_map_data_arr.append([i, j, 1])
                continue
            key = (min(i, j), max(i, j))
            if key not in pair_corr:
                df_ = df.select(df[row_name].cast('float'),
                                df[col_name].cast('float'))
                pair_corr[key] = df_.corr(row_name, col_name, 'pearson')
            # [row, column, value] is the layout the heat map expects
            # (per the original comment).
            heat_map_data_arr.append([i, j, pair_corr[key]])

    print(heat_map_data_arr)
    return heat_map_data_arr
计算相关性
def get_score(self):
    """Score every customer and return the ten best and ten worst.

    Reads ``df``, a DataFrame defined outside this block.  Each feature is
    weighted by the absolute value of its Pearson correlation with ``y``
    (``y`` itself counts as 1), normalised so all weights sum to 1.  The
    score is the weighted sum of the scoring columns.  Customers with the
    lowest scores are reported as "best", those with the highest as "worst".

    :return: ``(best_customer, worst_customer)``.  ``best_customer`` is a
        list of ``[int_id, float_score]`` ordered by ascending score;
        ``worst_customer`` is a list of ``[str_id, str_score]`` ordered by
        descending score.  Each list holds at most ten entries.
    """
    feature = ['y', '30-59days', 'DebtRatio', 'num_late_card',
               '60-89days', 'RealEstateLoans']
    # Columns that enter the score.  NOTE(review): DebtRatio is weighted
    # below but was never part of the original score formula; it is left
    # out here as well -- confirm whether that omission is intentional.
    score_columns = ['y', '30-59days', 'num_late_card',
                     '60-89days', 'RealEstateLoans']
    top_n = 10

    pearson_arr = []
    for name in feature:
        if name == 'y':
            # Correlation of the label with itself.
            pearson_arr.append(1)
        else:
            df_ = df.select(df[name].cast('float'), df['y'].cast('float'))
            pearson_arr.append(df_.corr(name, 'y', 'pearson'))

    total = sum(abs(value) for value in pearson_arr)
    precent_arr = [abs(value) / total for value in pearson_arr]
    print(precent_arr)

    # Look weights up by column name.  The original indexed precent_arr
    # positionally (0..4) while skipping DebtRatio, so num_late_card,
    # 60-89days and RealEstateLoans each received a neighbour's weight.
    weight_of = dict(zip(feature, precent_arr))
    score_expr = None
    for name in score_columns:
        term = df[name] * weight_of[name]
        score_expr = term if score_expr is None else score_expr + term
    df_score = df.withColumn('score', score_expr)
    df_single_score = df_score.select(df_score['ID'], df_score['score'])

    # Fetch only the rows that are needed instead of collecting the whole
    # sorted table; take() also copes with fewer than top_n rows.
    best_rows = df_single_score.sort('score').take(top_n)
    worst_rows = df_single_score.sort(
        df_single_score['score'].desc()).take(top_n)

    # Read fields from the row objects directly rather than parsing
    # str(row) with strip()/split(), which strips character sets (not
    # prefixes) and breaks on values containing those characters or commas.
    best_customer = [[int(row['ID']), float(row['score'])]
                     for row in best_rows]
    print(best_customer)

    # The worst list has always carried strings; that is kept so existing
    # callers see the same element types.
    worst_customer = [[str(row['ID']), str(row['score'])]
                      for row in worst_rows]
    print(worst_customer)

    return best_customer, worst_customer
统计得分情况
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)