本文共 7611 字,大约阅读时间需要 25 分钟。
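本文将介绍:如何使用 tf.feature_column 处理 Titanic 数据集中的离散特征、连续特征和交叉特征,并使用 tf.estimator 的预定义模型 LinearClassifier 与 DNNClassifier 进行训练和评估。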
下载数据的网址如下:
https://storage.googleapis.com/tf-datasets/titanic/train.csv
https://storage.googleapis.com/tf-datasets/titanic/eval.csv
将下载的数据读取到pandas中,代码如下:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the versions of the Python libraries in use.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

# Read the downloaded Titanic train/eval CSV files into pandas DataFrames
# and show the first rows of each.
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(train_df.head())
print(eval_df.head())
# Separate the target from the features: pop() removes the 'survived'
# column from each DataFrame and returns it, so train_df / eval_df hold
# only features afterwards.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

# Show the remaining feature columns and the extracted labels.
print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())
# Summary statistics of the numeric columns. Printed explicitly because a
# bare expression shows nothing when the code is run as a script.
print(train_df.describe())
# Split the features into two lists: categorical and numeric.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']
feature_columns = []

# Categorical features: use the distinct values seen in the training data
# as the vocabulary and wrap the column in an indicator (one-hot) column.
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# Numeric features: passed through as float32 values.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched tf.data dataset of (features, label) pairs.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` so
            each column becomes one named feature (callers in this file
            pass a pandas DataFrame).
        label_df: Labels, one per row of ``data_df``.
        epochs: How many times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements before
            repeating.
        batch_size: Number of examples per batch.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
# Create the pre-made LinearClassifier estimator.
linear_output_dir = 'linear_model'
# exist_ok=True replaces the separate "exists?" check before creating.
os.makedirs(linear_output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns)

# Train the LinearClassifier estimator.
linear_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

# Evaluate the LinearClassifier estimator on the eval set (one pass, no
# shuffling) and print the returned metrics instead of discarding them.
print(linear_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False)))
# Create the pre-made DNNClassifier estimator.
dnn_output_dir = './dnn_model'
# exist_ok=True replaces the separate "exists?" check before creating.
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],
    activation_fn=tf.nn.relu,
    optimizer='Adam')

# Train the DNNClassifier estimator.
dnn_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

# Evaluate the DNNClassifier estimator on the eval set (one pass, no
# shuffling) and print the returned metrics instead of discarding them.
print(dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False)))
# Build a crossed feature from several features and wrap it in
# tf.feature_column.indicator_column.
# cross feature: age: [1,2,3,4,5], gender: [male, female]
# age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# A cross can have very many distinct combinations (e.g. 100000), so each
# one is hashed into a fixed number of buckets: hash(value) % 100.
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size=100)))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 14:03:56 2020

@author: nijiahui
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the versions of the Python libraries in use.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)


### Part 1: load the Titanic dataset ##########################################
# 1. Download the Titanic dataset, then read and parse it with pandas.
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(train_df.head())
print(eval_df.head())

# 2. Separate the target from the features: pop() removes the 'survived'
#    column from each DataFrame and returns it.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')
print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())

# 3. Summary statistics of the numeric columns. Printed explicitly because
#    a bare expression shows nothing when the file is run as a script.
print(train_df.describe())


### Part 2: feature_column preprocessing and conversion to tf.data ############
# 1. Describe the categorical and numeric features as feature columns.
# Split the features into two lists: categorical and numeric.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']
feature_columns = []

# Categorical features: use the distinct values seen in the training data
# as the vocabulary and wrap the column in an indicator (one-hot) column.
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# Numeric features: passed through as float32 values.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))

# Build a crossed feature from several features and wrap it in
# tf.feature_column.indicator_column.
# cross feature: age: [1,2,3,4,5], gender: [male, female]
# age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# A cross can have very many distinct combinations (e.g. 100000), so each
# one is hashed into a fixed number of buckets: hash(value) % 100.
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size=100)))


# 2. Convert the in-memory data into a batched tf.data dataset.
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched tf.data dataset of (features, label) pairs.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` so
            each column becomes one named feature (this script passes a
            pandas DataFrame).
        label_df: Labels, one per row of ``data_df``.
        epochs: How many times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements before
            repeating.
        batch_size: Number of examples per batch.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset


### Part 3: LinearClassifier ##################################################
# Create the pre-made LinearClassifier estimator.
linear_output_dir = 'linear_model_new_features'
# exist_ok=True replaces the separate "exists?" check before creating.
os.makedirs(linear_output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns)

# Train the LinearClassifier estimator.
linear_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

# Evaluate the LinearClassifier estimator on the eval set (one pass, no
# shuffling) and print the returned metrics instead of discarding them.
print(linear_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False)))


### Part 4: DNNClassifier #####################################################
# Create the pre-made DNNClassifier estimator.
dnn_output_dir = './dnn_model_new_features'
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],
    activation_fn=tf.nn.relu,
    # Alternative: optimizer='Adam'
    optimizer='SGD')

# Train the DNNClassifier estimator.
dnn_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

# Evaluate the DNNClassifier estimator on the eval set (one pass, no
# shuffling) and print the returned metrics instead of discarding them.
print(dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False)))
转载地址:http://pvili.baihongyu.com/