📄 datapreprocess.m
字号:
% Column positions of each attribute in a record of adult.data / adult.test.
% The first 14 attributes become feature rows; income (15) is the target.
age_index=1;
workclass_index=2;
fnlwgt_index=3;
education_index=4;
education_num_index=5;
marital_status_index=6;
occupation_index=7;
relationship_index=8;
race_index=9;
sex_index=10;
capital_gain_index=11;
capital_loss_index=12;
hours_per_week_index=13;
native_country_index=14;
income_index=15;

% Category vocabularies for the string-valued attributes. Each entry must
% match the spelling used in the raw data files exactly (including the
% dataset's own spellings such as 'Holand-Netherlands', 'Trinadad&Tobago',
% 'Columbia', 'Hong' and 'South'), because the strings are matched verbatim.
% NOTE(review): records with a missing value ('?') are not listed here; how
% they are encoded depends on getCategoryIndex, which is defined elsewhere.
workclass={'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', ...
    'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'};
% '9th' and '7th-8th' are two distinct education levels; they were previously
% fused into the single string '9th, 7th-8th', which matches no record.
education={'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', ...
    'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', ...
    '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'};
marital_status={'Married-civ-spouse', 'Divorced', 'Never-married', ...
    'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'};
occupation={'Tech-support', 'Craft-repair', 'Other-service', 'Sales', ...
    'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', ...
    'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', ...
    'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'};
relationship={'Wife', 'Own-child', 'Husband', 'Not-in-family', ...
    'Other-relative', 'Unmarried'};
% 'Other' and 'Black' are two distinct categories; they were previously fused
% into the single string 'Other, Black', which matches no record.
race={'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'};
sex={'Female', 'Male'};
native_country={'United-States', 'Cambodia', 'England', 'Puerto-Rico', ...
    'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', ...
    'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', ...
    'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', ...
    'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', ...
    'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', ...
    'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', ...
    'Hong', 'Holand-Netherlands'};
income={'<=50K','>50K'};
% Record layout, in column order:
%data={age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income};
% Load the training records from adult.data, encode every attribute as a
% number, and min-max scale each feature row.
% Produces:
%   train_data            - one row per attribute (rows 1..native_country_index),
%                           one column per record
%   train_target          - 1-by-N row: income category index minus 1
%   normalized_train_data - train_data with each row scaled by its own min/max
disp('load train data...');
fid=fopen('adult.data','r');
if fid==-1
    % Fail early with a clear message instead of an invalid-identifier error
    % from textscan.
    error('datapreprocess:cannotOpenFile','Unable to open adult.data for reading.');
end
try
    % 15 comma-separated columns: %f = numeric attribute, %s = string attribute.
    data=textscan(fid,'%f%s%f%s%f%s%s%s%s%s%f%f%f%s%s','delimiter',',');
catch read_err
    fclose(fid);    % do not leak the handle when parsing fails
    rethrow(read_err);
end
fclose(fid);

%preprocess train data
disp('train data preprocessing...');
% Start from a fresh matrix so that a stale train_data left in the workspace
% by an earlier run (possibly with a different size) cannot leak into the result.
num_train_samples=numel(data{age_index});
train_data=zeros(native_country_index,num_train_samples);

% Numeric attributes are copied as-is. String attributes are converted with
% getCategoryIndex (defined elsewhere; presumably returns the position of each
% string within the given category list - TODO confirm).
%age_index..
disp('convert age property...');
train_data(age_index,:)=data{age_index}';
%workclass_index...
disp('convert workclass property...');
train_data(workclass_index,:)=getCategoryIndex(data{workclass_index},workclass)';
%fnlwgt_index...
disp('convert fnlwgt property...');
train_data(fnlwgt_index,:)=data{fnlwgt_index}';
%education_index...
disp('convert education property...');
train_data(education_index,:)=getCategoryIndex(data{education_index},education)';
%education_num_index...
disp('convert education_num property...');
train_data(education_num_index,:)=data{education_num_index}';
%marital_status_index..
disp('convert marital_status property...');
train_data(marital_status_index,:)=getCategoryIndex(data{marital_status_index},marital_status)';
%occupation_index...
disp('convert occupation property...');
train_data(occupation_index,:)=getCategoryIndex(data{occupation_index},occupation)';
%relationship_index...
disp('convert relationship property...');
train_data(relationship_index,:)=getCategoryIndex(data{relationship_index},relationship)';
%race_index...
disp('convert race property...');
train_data(race_index,:)=getCategoryIndex(data{race_index},race)';
%sex_index...
disp('convert sex property...');
train_data(sex_index,:)=getCategoryIndex(data{sex_index},sex)';
%capital_gain_index...
disp('convert capital_gain property...');
train_data(capital_gain_index,:)=data{capital_gain_index}';
%capital_loss_index...
disp('convert capital_loss property...');
train_data(capital_loss_index,:)=data{capital_loss_index}';
%hours_per_week_index...
disp('convert hours_per_week property...');
train_data(hours_per_week_index,:)=data{hours_per_week_index}';
%native_country_index...
disp('convert native_country property...');
train_data(native_country_index,:)=getCategoryIndex(data{native_country_index},native_country)';

%train_target............
disp('convert train_target property...');
% Build the target as a 1-by-N row without indexing into any pre-existing
% train_target, then shift the category index down by one.
train_target=reshape(getCategoryIndex(data{income_index},income),1,[])-1;

%normalize the train data.
disp('normalize the train data property...');
normalized_train_data=zeros(size(train_data));
for row_idx=1:size(train_data,1)
    row_min=min(train_data(row_idx,:));
    row_range=max(train_data(row_idx,:))-row_min;
    if row_range==0
        % Constant feature: 0/0 would fill the row with NaN, so map it to 0.
        normalized_train_data(row_idx,:)=0;
    else
        normalized_train_data(row_idx,:)=(train_data(row_idx,:)-row_min)/row_range;
    end
end
disp('train data processing finish...................................')
% Load the test records from adult.test, encode every attribute as a number,
% and min-max scale each feature row.
% Produces:
%   test_train_data            - one row per attribute (rows 1..native_country_index),
%                                one column per record
%   test_train_target          - 1-by-N row: income category index minus 1
%   normalized_test_train_data - test_train_data with each row scaled by its
%                                own min/max
disp('load test data property...');
fid=fopen('adult.test','r');
if fid==-1
    % Fail early with a clear message instead of an invalid-identifier error
    % from textscan.
    error('datapreprocess:cannotOpenFile','Unable to open adult.test for reading.');
end
try
    % Same 15-column layout as the training file; 'headerLines',1 skips the
    % first line of the file (presumably a non-data banner line).
    test_data=textscan(fid,'%f%s%f%s%f%s%s%s%s%s%f%f%f%s%s','delimiter',',','headerLines',1);
catch read_err
    fclose(fid);    % do not leak the handle when parsing fails
    rethrow(read_err);
end
fclose(fid);

disp('test data preprocessing...')
% Start from a fresh matrix so that a stale test_train_data left in the
% workspace by an earlier run cannot leak into the result.
num_test_samples=numel(test_data{age_index});
test_train_data=zeros(native_country_index,num_test_samples);

% Numeric attributes are copied as-is. String attributes are converted with
% getCategoryIndex (defined elsewhere; presumably returns the position of each
% string within the given category list - TODO confirm).
%age_index..
disp('convert age property...');
test_train_data(age_index,:)=test_data{age_index}';
%workclass_index...
disp('convert workclass property...');
test_train_data(workclass_index,:)=getCategoryIndex(test_data{workclass_index},workclass)';
%fnlwgt_index...
disp('convert fnlwgt property...');
test_train_data(fnlwgt_index,:)=test_data{fnlwgt_index}';
%education_index...
disp('convert education property...');
test_train_data(education_index,:)=getCategoryIndex(test_data{education_index},education)';
%education_num_index...
disp('convert education_num property...');
test_train_data(education_num_index,:)=test_data{education_num_index}';
%marital_status_index..
disp('convert marital_status property...');
test_train_data(marital_status_index,:)=getCategoryIndex(test_data{marital_status_index},marital_status)';
%occupation_index...
disp('convert occupation property...');
test_train_data(occupation_index,:)=getCategoryIndex(test_data{occupation_index},occupation)';
%relationship_index...
disp('convert relationship property...');
test_train_data(relationship_index,:)=getCategoryIndex(test_data{relationship_index},relationship)';
%race_index...
disp('convert race property...');
test_train_data(race_index,:)=getCategoryIndex(test_data{race_index},race)';
%sex_index...
disp('convert sex property...');
test_train_data(sex_index,:)=getCategoryIndex(test_data{sex_index},sex)';
%capital_gain_index...
disp('convert capital_gain property...');
test_train_data(capital_gain_index,:)=test_data{capital_gain_index}';
%capital_loss_index...
disp('convert capital_loss property...');
test_train_data(capital_loss_index,:)=test_data{capital_loss_index}';
%hours_per_week_index...
disp('convert hours_per_week property...');
test_train_data(hours_per_week_index,:)=test_data{hours_per_week_index}';
%native_country_index...
disp('convert native_country property...');
test_train_data(native_country_index,:)=getCategoryIndex(test_data{native_country_index},native_country)';

%test target............
disp('convert train_target property...');
% The income labels are matched with a trailing period here ('<=50K.', '>50K.'),
% unlike the labels used for the training file. Build the target as a 1-by-N
% row without indexing into any pre-existing test_train_target, then shift the
% category index down by one.
test_train_target=reshape(getCategoryIndex(test_data{income_index},{'<=50K.','>50K.'}),1,[])-1;

%normalize the test data...
% NOTE(review): each test feature is scaled with the test set's own min/max,
% not the training set's, so train and test features may end up on slightly
% different scales - confirm this is intended.
normalized_test_train_data=zeros(size(test_train_data));
for row_idx=1:size(test_train_data,1)
    row_min=min(test_train_data(row_idx,:));
    row_range=max(test_train_data(row_idx,:))-row_min;
    if row_range==0
        % Constant feature: 0/0 would fill the row with NaN, so map it to 0.
        normalized_test_train_data(row_idx,:)=0;
    else
        normalized_test_train_data(row_idx,:)=(test_train_data(row_idx,:)-row_min)/row_range;
    end
end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -