AI-challenger-stock data-processing code

Data preparation walkthrough (verbose version)


"""n如何處理和準備ai-challenger-stock的數據n**Author**: `https://github.com/EmbraceLife/shendusuipian`n"""nimport pandas as pdnimport numpy as npnimport matplotlib.pyplot as pltnimport bcolznfrom sklearn.preprocessing import MinMaxScalernfrom sklearn.preprocessing import StandardScalernn# load train, test set from csvn# csv: id, headersn# train_set file: contain training and dev setn# test_set file: just test setn# return (train_set, test_set, test_id)ndef get_train_test_sets(train_file, test_file):nt# print all rows and columns when neededn pd.get_option(display.max_rows)n pd.set_option(display.max_rows, None)n pd.set_option(display.max_columns, None)n # pd.set_option(display.max_column_width, None)nnt# load into pd.DataFramen train_set = pd.read_csv(train_file)n test_set = pd.read_csv(test_file)nnt# extract test_set id Series, for submissionn test_id = test_set.loc[:, id]nnt# use id as indexn train_set.set_index(id, inplace=True) # set id as indexn test_set.set_index(id, inplace=True)n return (train_set, test_set, test_id)nn# standarize all features with standard scaler or mimax scalern# raw_features: np.array or pd.DataFramen# scaler: stad or mimaxn# return: np.arrayndef standardization(raw_features, scaler=stad):nt# standardize arrays: stad vs mimaxn """ raw_features: pd.DataFrame or np.array both okn """n if scaler == stad:n standardized_features = StandardScaler().fit_transform(raw_features)n elif scaler == mimax:n standardized_features = MinMaxScaler().fit_transform(raw_features)n return standardized_featuresnn# combine train_set and test_setn# train and test sets are pd.DataFramen# train_set: 包含training and validation setsn# select featuresn# combine train and testn# standardize it alln# return np.arrayndef combine_train_test(train_dev_set, test_set):nt# drop unwanted to select wantedn train_dev_features = train_dev_set.drop([weight, era, label, group], axis=1)n test_set = test_set.drop([group], axis=1)nt# pd.concat to merge dataframesn train_test_combine = pd.concat([train_dev_features, test_set], axis=0).valuesnt# use a custom function called standarization(...)n # train_test_combine = standardization(train_test_combine, scaler=stad) # standarizationnn return train_test_combinenn# make (samples, features) into (samples, window, features)n# train_test_combine: np.array, window: time_stepsn# return np.array (samples, window, features)ndef make_steps(train_test_combine, window=1):n p = 0 # 計數n features_list = [] # 特徵值容器nnt# must use "<" here !!!! 

# split into train_sequence, train_target, train_weight,
# dev_sequence, dev_target, dev_weight, test_sequence
def split_train_dev(train_dev_set=None, test_set=None, train_test_sequence=None, odd_even=True, window=1):
    # train_dev_set: pd.DataFrame containing the training and validation sets
    if window == 1:
        target = train_dev_set.loc[:, "label"]
        weight = train_dev_set.loc[:, "weight"]  # sample weights for the loss function; must stay aligned in time with the targets
        era = train_dev_set.loc[:, "era"]        # used to split off the dev set
    else:
        # use .iloc for positional slicing: the index is "id", so .loc would slice by label
        target = train_dev_set["label"].iloc[window:]   # targets start at label[window], e.g. label[30]
        weight = train_dev_set["weight"].iloc[window:]  # keep weights aligned in time with the targets
        era = train_dev_set["era"].iloc[window:]        # used to split off the dev set

    # weight = standardization(weight.values.reshape((-1, 1)), scaler="mimax")  # input array must be 2-d
    # weight = pd.DataFrame(weight)

    if window == 1:
        # drop unwanted columns; whether to drop "group" too is left as an
        # experiment (see the commented alternatives below)
        test_sequence = test_set.drop(["group"], axis=1)
        train_dev_sequence = train_dev_set.drop(["group", "weight", "era", "label"], axis=1)
        # test_sequence = test_set
        # train_dev_sequence = train_dev_set.drop(["weight", "era", "label"], axis=1)
    else:
        n_test = test_set.shape[0]
        test_sequence = train_test_sequence[-n_test:, :, :]
        train_dev_sequence = train_test_sequence[:-n_test, :, :]

    # how to select the train and dev sets
    if odd_even:
        # all odd eras become the train set, all even eras the dev set
        if window == 1:
            train_sequence = train_dev_sequence[era.values % 2 == 1]  # training set
            dev_sequence = train_dev_sequence[era.values % 2 == 0]    # dev set
        else:
            train_sequence = train_dev_sequence[era.values % 2 == 1, :, :]  # training set
            dev_sequence = train_dev_sequence[era.values % 2 == 0, :, :]    # dev set
        # targets and weights must use the same masks as the sequences above
        train_target = target.values[era.values % 2 == 1]
        train_weight = weight.values[era.values % 2 == 1]
        dev_target = target.values[era.values % 2 == 0]
        dev_weight = weight.values[era.values % 2 == 0]
    else:
        # select particular eras for the train and dev sets
        if window == 1:
            # select by era number
            # train_sequence = train_dev_sequence.loc[(era.values != 2) & (era.values != 1), :]
            train_sequence = train_dev_sequence.loc[era.values > 2, :]
            # dev_sequence = train_dev_sequence.loc[(era.values == 2) | (era.values == 1), :]
            dev_sequence = train_dev_sequence.loc[era.values <= 2, :]
        else:
            # plain boolean indexing: train_dev_sequence is an np.array here, so .loc does not apply
            train_sequence = train_dev_sequence[(era.values != 2) & (era.values != 1), :, :]
            dev_sequence = train_dev_sequence[(era.values == 2) | (era.values == 1), :, :]
        train_target = target.values[(era.values != 2) & (era.values != 1)]
        train_weight = weight.values[(era.values != 2) & (era.values != 1)]
        # train_target = target.values[era.values > 2]
        # train_weight = weight.values[era.values > 2]
        dev_target = target.values[(era.values == 2) | (era.values == 1)]
        dev_weight = weight.values[(era.values == 2) | (era.values == 1)]
        # dev_target = target.values[era.values <= 2]
        # dev_weight = weight.values[era.values <= 2]
    # train_weight_stad = standardization(train_weight.reshape((-1, 1)))  # input array must be 2-d
    # dev_weight_stad = standardization(dev_weight.reshape((-1, 1)))
    # train_weight_mimax = standardization(train_weight.reshape((-1, 1)), scaler="mimax")
    # dev_weight_mimax = standardization(dev_weight.reshape((-1, 1)), scaler="mimax")

    return (train_sequence, train_target, train_weight,
            dev_sequence, dev_target, dev_weight, test_sequence)
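
# --- Added sketch (not part of the original script) of the odd/even era
# split performed above: one boolean mask built from era parity must be
# applied consistently to features, targets, and weights, otherwise the
# rows fall out of alignment. The toy arrays are illustrative only.
def _demo_odd_even_split():
    era_toy = np.array([1, 1, 2, 2, 3, 3])
    X_toy = np.arange(12).reshape(6, 2)
    y_toy = np.array([0, 1, 0, 1, 0, 1])
    train_mask = era_toy % 2 == 1                   # odd eras -> train
    X_train, y_train = X_toy[train_mask], y_toy[train_mask]
    X_dev, y_dev = X_toy[~train_mask], y_toy[~train_mask]
    assert X_train.shape == (4, 2) and y_train.shape == (4,)
    assert X_dev.shape == (2, 2) and y_dev.shape == (2,)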

# save large np.arrays with bcolz
# list_arrays: only 1-d arrays; not for 2-d or 3-d arrays
# todo: learn h5 as an alternative
def save_large_arrays(dir_path, list_arrays):
    c = bcolz.carray(list_arrays, rootdir=dir_path, mode="w")
    c.flush()
    print("saved %d arrays" % len(list_arrays))


# see the distributions of all features in one figure
# features_array: a 2-d pd.DataFrame
def all_features_distribution(features_array, first=None, middle=None, last=None):
    if first is not None:
        features_array.iloc[:, :first].hist()
        plt.show()
    if last is not None:
        features_array.iloc[:, -last:].hist()
        plt.show()
    if middle is not None:
        mid = round(features_array.shape[1] / 2)
        half_middle = round(middle / 2)
        features_array.iloc[:, (mid - half_middle):(mid + half_middle)].hist()
        plt.show()
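
# --- Added sketch (not part of the original script): round-tripping arrays
# through bcolz. carray(..., mode="w") writes the data under the rootdir
# directory, flush() persists it, and bcolz.open() reads it back lazily.
# The /tmp path is a hypothetical example location.
def _demo_bcolz_roundtrip():
    arrays = [np.arange(5.0), np.arange(5.0) * 2]
    save_large_arrays("/tmp/bcolz_demo", arrays)
    restored = bcolz.open("/tmp/bcolz_demo", mode="r")
    assert np.allclose(restored[0], arrays[0])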

# examples
# data version chosen: 20170910
def tests():
    # test get_train_test_sets()
    train_file = "/Users/Natsume/Documents/AI-challenger-stocks/train_data/20170929/ai_challenger_stock_train_20170929/stock_train_data_20170929.csv"
    test_file = "/Users/Natsume/Documents/AI-challenger-stocks/test_data/20170929/ai_challenger_stock_test_20170929/stock_test_data_20170929.csv"
    train_dev_set, test_set, test_id = get_train_test_sets(train_file=train_file, test_file=test_file)

    # display feature distributions: first, middle, or last few features if there are too many
    all_features_distribution(train_dev_set, middle=20)

    # save the test_set id information for submission
    np.save("/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/test_index.npy", test_id)

    # test standardization() and combine_train_test()
    train_test_combine = combine_train_test(train_dev_set=train_dev_set, test_set=test_set)
    m = train_test_combine.shape[0]

    # only use the later half of the dataset to train, validate and test
    # train_test_combine = train_test_combine[m // 2:, :]

    # test make_steps()
    train_test_sequence = make_steps(train_test_combine=train_test_combine)

    # test split_train_dev()
    train_sequence, train_target, train_weight, dev_sequence, dev_target, dev_weight, test_sequence = split_train_dev(
        train_dev_set=train_dev_set, test_set=test_set, train_test_sequence=train_test_sequence)

    # test save_large_arrays()
    # use np.save for individual arrays as npy files
    # train_sequence_file = "/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/train_sequence.npy"
    # dev_sequence_file = "/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/dev_sequence.npy"
    # test_sequence_file = "/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/test_sequence.npy"
    #
    # np.save(train_sequence_file, train_sequence)
    # np.save(dev_sequence_file, dev_sequence)
    # np.save(test_sequence_file, test_sequence)

    # use np.savez to store several arrays in one npz file
    train_dev_test_sequence_npz = "/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/train_dev_test_sequence.npz"
    train_dev_target_weight_sequence_npz = "/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/target_weight_norm.npz"
    train_dev_target_weight_sequence_path = "/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/target_weight_norm_dir/"

    list_arrays = [train_target, dev_target, train_weight, dev_weight]
    save_large_arrays(train_dev_target_weight_sequence_path, list_arrays)

    np.savez(train_dev_target_weight_sequence_npz, train_target=train_target, dev_target=dev_target,
             train_weight=train_weight, dev_weight=dev_weight)

    np.savez(train_dev_test_sequence_npz, train_sequence=train_sequence,
             dev_sequence=dev_sequence, test_sequence=test_sequence)

    npzfiles = np.load(train_dev_test_sequence_npz)
    npzfiles["train_sequence"].shape


tests()
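
Once tests() has run, a downstream training script only needs to load the saved files back. A minimal sketch, assuming the same .npz path as above; the loop over the keys is an illustration, not part of the original script:

import numpy as np

# np.load on an .npz file returns a lazy, dict-like NpzFile keyed by the
# names that were passed to np.savez
data = np.load("/Users/Natsume/Documents/AI-challenger-stocks/prepared_dataset/train_dev_test_sequence.npz")
for key in data.files:  # train_sequence, dev_sequence, test_sequence
    print(key, data[key].shape)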
