add feature column
parent
88b429730a
commit
6da4a33659
|
@ -0,0 +1,325 @@
|
|||
## 1 tensorflow中的特征工程
|
||||
传统的机器学习中,特征工程占有非常重要的地位。与其说算法工程师是在做算法,不如说是做数据,做特征。通常花费60%-70%的时间做特征工程也是很常见。由此可见特征工程的重要性与复杂性。
|
||||
深度学习的崛起,很大程度上是减少了特征工程的工作量,可以让网络自己去学习各种特征之间的组合,即所谓的端到端(end to end)学习。
|
||||
即便如此,特征工程仍然是深度学习算法中的重要一环,前期数据的预处理质量仍然很大程度决定了算法的最终效果。
|
||||
|
||||
tensorflow中提供了feature columns的相关API。feature columns是输入特征数据与estimator(真正的算法模型)之间沟通的桥梁。
|
||||
因为深度神经网络只能处理数值类的数据,算法的核心步骤是前向的加法模型与梯度的反向传播,这里面其实就是数值的加法与乘法运算。但在实际中,除了连续的数值变量,更多的是非数值的类别特征,比如性别,地域这种。
|
||||
因此,feature column可以将特征作为单个语义单元来操作,指定转换并选择要包括的特征,然后就可以直接喂给模型,而不需要人工再进行复杂的转换与运算。
|
||||
|
||||
## 2.数值类特征转换
|
||||
### 2.1 numeric_column
|
||||
numeric_column主要处理连续型变量,可以是float也可以是int,从table中读取对应的key,并将其转成相应的dtype。
|
||||
|
||||
```
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
def t1():
|
||||
features = {"price": [1.5, 1.3, 2.1, 2.5, 0.6, 3.1, 3.2]}
|
||||
price = tf.feature_column.numeric_column("price", default_value=2.5)
|
||||
columns = [price]
|
||||
|
||||
# 输入层(数据列,特征列)
|
||||
input_layer = tf.feature_column.input_layer(features, columns)
|
||||
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(input_layer)
|
||||
print(result)
|
||||
print(type(result))
|
||||
|
||||
```
|
||||
|
||||
最终结果为
|
||||
```
|
||||
[[1.5]
|
||||
[1.3]
|
||||
[2.1]
|
||||
[2.5]
|
||||
[0.6]
|
||||
[3.1]
|
||||
[3.2]]
|
||||
<class 'numpy.ndarray'>
|
||||
```
|
||||
|
||||
### 2.2 bucketized_column
|
||||
bucketized_column也是用来处理连续变量类型。与numeric_column不同的是,其会将特征bucketized(分桶)离散化,最后得到的是离散化以后的one-hot结果。
|
||||
```
|
||||
'''
|
||||
[-inf, 0), [0, 1), [1, 2), [2, 3), [3, +inf)
|
||||
'''
|
||||
def t2():
|
||||
features = {"price": [1.5, 1.3, 2.1, 2.5, 0.6, 3.1, 3.2]}
|
||||
step = 1.0
|
||||
boundaries = list(np.arange(0, 4, step))
|
||||
price = tf.feature_column.bucketized_column(tf.feature_column.numeric_column("price", default_value=2.5),
|
||||
boundaries=boundaries)
|
||||
columns = [price]
|
||||
input_layer = tf.feature_column.input_layer(features, columns)
|
||||
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(input_layer)
|
||||
print(result)
|
||||
```
|
||||
最终结果为
|
||||
```
|
||||
[[0. 0. 1. 0. 0.]
|
||||
[0. 0. 1. 0. 0.]
|
||||
[0. 0. 0. 1. 0.]
|
||||
[0. 0. 0. 1. 0.]
|
||||
[0. 1. 0. 0. 0.]
|
||||
[0. 0. 0. 0. 1.]
|
||||
[0. 0. 0. 0. 1.]]
|
||||
```
|
||||
|
||||
上面的代码将数值空间分成了五个部分,所以最终one-hot出来的结果总共有5维。注意各区间是左闭右开。
|
||||
|
||||
|
||||
## 3.category类特征转换
|
||||
|
||||
### 3.1 categorical_column_with_identity
|
||||
categorical_column_with_identity的作用就是对特征进行one-hot编码。
|
||||
```
|
||||
def t1():
|
||||
features = {"category": [[1], [3], [1], [3], [2], [1]]}
|
||||
|
||||
category = tf.feature_column.categorical_column_with_identity("category", num_buckets=4, default_value=0)
|
||||
category = tf.feature_column.indicator_column(category)
|
||||
|
||||
columns = [category]
|
||||
|
||||
input_layer = tf.feature_column.input_layer(features, columns)
|
||||
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(input_layer)
|
||||
print(result)
|
||||
```
|
||||
|
||||
最终结果为
|
||||
```
|
||||
[[0. 1. 0. 0.]
|
||||
[0. 0. 0. 1.]
|
||||
[0. 1. 0. 0.]
|
||||
[0. 0. 0. 1.]
|
||||
[0. 0. 1. 0.]
|
||||
[0. 1. 0. 0.]]
|
||||
```
|
||||
源码中的有相关注释
|
||||
```
|
||||
@tf_export('feature_column.categorical_column_with_identity')
|
||||
def categorical_column_with_identity(key, num_buckets, default_value=None):
|
||||
"""A `CategoricalColumn` that returns identity values.
|
||||
|
||||
Use this when your inputs are integers in the range `[0, num_buckets)`, and
|
||||
you want to use the input value itself as the categorical ID. Values outside
|
||||
this range will result in `default_value` if specified, otherwise it will
|
||||
fail.
|
||||
|
||||
Typically, this is used for contiguous ranges of integer indexes, but
|
||||
it doesn't have to be. This might be inefficient, however, if many of IDs
|
||||
are unused. Consider `categorical_column_with_hash_bucket` in that case.
|
||||
|
||||
For input dictionary `features`, `features[key]` is either `Tensor` or
|
||||
`SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
|
||||
and `''` for string, which will be dropped by this feature column.
|
||||
|
||||
```
|
||||
根据注释不难看出,该方法只适用于值为整数的类别变量,其中num_buckets为类别的总数。
|
||||
|
||||
### 3.2 categorical_column_with_vocabulary_list categorical_column_with_vocabulary_file
|
||||
|
||||
上面的两个方法是根据单词出现的序列顺序,将其进行one-hot编码。
|
||||
如果对应的类别比较少,可以一一列举出来,则可以使用vocabulary_list方法。
|
||||
如果对应的类别比较多,无法一一列举而需要从文件中输入,则可以使用vocabulary_file。
|
||||
|
||||
|
||||
```
|
||||
def t2():
|
||||
features = {'sex': ['male', 'male', 'female', 'female']}
|
||||
sex_column = tf.feature_column.categorical_column_with_vocabulary_list('sex', ['male', 'female'])
|
||||
sex_column = tf.feature_column.indicator_column(sex_column)
|
||||
|
||||
columns = [sex_column]
|
||||
|
||||
input_layer = tf.feature_column.input_layer(features, columns)
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(input_layer)
|
||||
print(result)
|
||||
```
|
||||
最终的结果为
|
||||
```
|
||||
[[1. 0.]
|
||||
[1. 0.]
|
||||
[0. 1.]
|
||||
[0. 1.]]
|
||||
```
|
||||
|
||||
### 3.3 categorical_column_with_hash_bucket
|
||||
顾名思义,该方法对特征的处理方式为hash分桶。因为是hash算法,所以就有可能存在冲突。
|
||||
```
|
||||
def t3():
|
||||
features = {'department': ['sport', 'sport', 'drawing', 'gardening', 'travelling']}
|
||||
department = tf.feature_column.categorical_column_with_hash_bucket('department', 4, dtype=tf.string)
|
||||
department = tf.feature_column.indicator_column(department)
|
||||
|
||||
columns = [department]
|
||||
|
||||
input_layer = tf.feature_column.input_layer(features, columns)
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(input_layer)
|
||||
print(result)
|
||||
```
|
||||
最终输出结果为
|
||||
```
|
||||
[[0. 1. 0. 0.]
|
||||
[0. 1. 0. 0.]
|
||||
[0. 1. 0. 0.]
|
||||
[1. 0. 0. 0.]
|
||||
|
||||
```
|
||||
|
||||
### 3.4 crossed_column
|
||||
crossed_column是用来处理交叉特征的。传统机器学习方法中,我们经常人工设计各种交叉特征,比如最常见的将年龄与性别交叉,得到一维新的特征。
|
||||
|
||||
```
|
||||
def t4():
|
||||
features = {
|
||||
'sex': [1, 2, 1, 1, 2],
|
||||
'department': ['sport', 'sport', 'drawing', 'gardening', 'travelling'],
|
||||
}
|
||||
|
||||
# 特征列
|
||||
department = tf.feature_column.categorical_column_with_vocabulary_list('department',
|
||||
['sport', 'drawing', 'gardening',
|
||||
'travelling'], dtype=tf.string)
|
||||
sex = tf.feature_column.categorical_column_with_identity('sex', num_buckets=2, default_value=0)
|
||||
sex_department = tf.feature_column.crossed_column([department, sex], 16)
|
||||
sex_department = tf.feature_column.indicator_column(sex_department)
|
||||
# 组合特征列
|
||||
columns = [sex_department]
|
||||
input_layer = tf.feature_column.input_layer(features, columns)
|
||||
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(input_layer)
|
||||
print(type(result))
|
||||
print(result)
|
||||
```
|
||||
|
||||
最终输出结果为
|
||||
```
|
||||
<class 'numpy.ndarray'>
|
||||
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
|
||||
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
|
||||
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
|
||||
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
|
||||
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]
|
||||
|
||||
```
|
||||
|
||||
### 3.5 embedding_column
|
||||
对高维稀疏类别特征进行低维稠密压缩,是深度学习中各算法的标配。embedding_column就是用来处理embedding的。
|
||||
|
||||
```
|
||||
def t5():
|
||||
color_data = {'color': [['R'], ['G'], ['B'], ['A']]}
|
||||
color = tf.feature_column.categorical_column_with_vocabulary_list(
|
||||
'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
|
||||
|
||||
color_embeding = tf.feature_column.embedding_column(color, 8)
|
||||
color_embeding_dense_tensor = tf.feature_column.input_layer(color_data, color_embeding)
|
||||
|
||||
with tf.Session() as sess:
|
||||
init = tf.global_variables_initializer()
|
||||
sess.run(tf.tables_initializer())
|
||||
sess.run(init)
|
||||
|
||||
result = sess.run(color_embeding_dense_tensor)
|
||||
print(result)
|
||||
```
|
||||
|
||||
最终输出的结果为
|
||||
```
|
||||
[[ 0.06087255 0.14802316 -0.53089684 0.3792207 -0.07220224 0.11126718
|
||||
0.13428093 0.05913785]
|
||||
[ 0.11378041 0.09111454 0.13312939 0.4688532 0.16119204 -0.00828113
|
||||
-0.07480547 -0.19241782]
|
||||
[-0.505195 0.14229129 0.17853943 -0.05520723 -0.19731279 -0.28580233
|
||||
-0.16668914 -0.68252677]
|
||||
[ 0. 0. 0. 0. 0. 0.
|
||||
0. 0. ]]
|
||||
```
|
||||
|
||||
简单看一下embedding_column的源码
|
||||
```
|
||||
@tf_export('feature_column.embedding_column')
|
||||
def embedding_column(categorical_column,
|
||||
dimension,
|
||||
combiner='mean',
|
||||
initializer=None,
|
||||
ckpt_to_load_from=None,
|
||||
tensor_name_in_ckpt=None,
|
||||
max_norm=None,
|
||||
trainable=True):
|
||||
"""`DenseColumn` that converts from sparse, categorical input.
|
||||
|
||||
Use this when your inputs are sparse, but you want to convert them to a dense
|
||||
representation (e.g., to feed to a DNN).
|
||||
|
||||
Inputs must be a `CategoricalColumn` created by any of the
|
||||
`categorical_column_*` function. Here is an example of using
|
||||
`embedding_column` with `DNNClassifier`:
|
||||
...
|
||||
Args:
|
||||
categorical_column: A `CategoricalColumn` created by a
|
||||
`categorical_column_with_*` function. This column produces the sparse IDs
|
||||
that are inputs to the embedding lookup.
|
||||
dimension: An integer specifying dimension of the embedding, must be > 0.
|
||||
combiner: A string specifying how to reduce if there are multiple entries in
|
||||
a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
|
||||
'mean' the default. 'sqrtn' often achieves good accuracy, in particular
|
||||
with bag-of-words columns. Each of this can be thought as example level
|
||||
normalizations on the column. For more information, see
|
||||
`tf.embedding_lookup_sparse`.
|
||||
initializer: A variable initializer function to be used in embedding
|
||||
variable initialization. If not specified, defaults to
|
||||
`tf.truncated_normal_initializer` with mean `0.0` and standard deviation
|
||||
`1/sqrt(dimension)`.
|
||||
ckpt_to_load_from: String representing checkpoint name/pattern from which to
|
||||
restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
|
||||
tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
|
||||
to restore the column weights. Required if `ckpt_to_load_from` is not
|
||||
`None`.
|
||||
max_norm: If not `None`, embedding values are l2-normalized to this value.
|
||||
trainable: Whether or not the embedding is trainable. Default is True.
|
||||
|
||||
Returns:
|
||||
`DenseColumn` that converts from sparse input.
|
||||
```
|
||||
|
||||
重点关注一下两个参数:
|
||||
1.combiner。当一行有多个entries时,可以使用'mean', 'sqrtn', 'sum'的方式来对向量进行处理,默认的方法是mean。
|
||||
2.initializer 初始化embedding向量的方法,默认的是用`tf.truncated_normal_initializer`方法,均值为0.0,方差为`1/sqrt(dimension)`。
|
|
@ -0,0 +1,5 @@
|
|||
## 1.各种层
|
||||
1.tf.nn:最底层的函数,其他各种库可以说都是基于这个底层库来进行扩展的。
|
||||
2.tf.layers:比tf.nn更高级的库,对tf.nn进行了多方位功能的扩展。用程序员的话来说,就是用tf.nn造的轮子。最大的特点就是库中每个函数都有相应的类(函数名为大写,看了下底层源码,是从kears那迁移过来的)。
|
||||
3.tf.keras:如果说tf.layers是轮子,那么keras可以说是汽车。tf.keras是基于tf.layers和tf.nn的高度封装。
|
||||
4.tf.contrib.layers:tf.contrib.layers提供够将计算图中的 网络层、正则化、摘要操作、是构建计算图的高级操作,但是tf.contrib包含不稳定和实验代码,有可能以后API会改变。
|
|
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
#Author: WangLei
|
||||
#date: 2020/6/24
|
||||
|
||||
import os
|
||||
from six.moves.urllib.request import urlopen
|
||||
|
||||
import pandas as pd
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
# Data sets
|
||||
IRIS_TRAINING = "data/iris_training.csv"
|
||||
IRIS_TRAINING_URL = "http://download.tensorflow.org/data/iris_training.csv"
|
||||
|
||||
IRIS_TEST = "data/iris_test.csv"
|
||||
IRIS_TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
|
||||
|
||||
if not os.path.exists(IRIS_TRAINING):
|
||||
raw = urlopen(IRIS_TRAINING_URL).read()
|
||||
with open(IRIS_TRAINING, 'wb') as f:
|
||||
f.write(raw)
|
||||
|
||||
|
||||
if not os.path.exists(IRIS_TEST):
|
||||
raw = urlopen(IRIS_TEST_URL).read()
|
||||
with open(IRIS_TEST, 'wb') as f:
|
||||
f.write(raw)
|
||||
|
||||
FEATURES = ['SepalLength', 'SepalWidth','PetalLength', 'PetalWidth', 'Species']
|
||||
SPECIES = ['Setosa', 'Versicolor', 'Virginica']
|
||||
|
||||
train = pd.read_csv(IRIS_TRAINING, names=FEATURES, header=0)
|
||||
train_x = train[['SepalLength', 'SepalWidth','PetalLength', 'PetalWidth']]
|
||||
train_y = train['Species']
|
||||
del train
|
||||
|
||||
test = pd.read_csv(IRIS_TEST, names=FEATURES, header=0)
|
||||
test_x = test[['SepalLength', 'SepalWidth','PetalLength', 'PetalWidth']]
|
||||
test_y = test['Species']
|
||||
del test
|
||||
|
||||
feature_columns = []
|
||||
for key in train_x.keys():
|
||||
feature_columns.append(tf.feature_column.numeric_column(key=key))
|
||||
|
||||
classifier = tf.estimator.DNNClassifier(
|
||||
feature_columns=feature_columns,
|
||||
hidden_units = [10, 10],
|
||||
n_classes=3
|
||||
)
|
||||
|
||||
|
||||
def train_input_fn(features, labels, batch_size):
|
||||
dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
|
||||
dataset = dataset.shuffle(1000).repeat().batch(batch_size)
|
||||
return dataset.make_one_shot_iterator().get_next()
|
||||
|
||||
batch_size = 100
|
||||
classifier.train(input_fn=lambda :train_input_fn(train_x, train_y, batch_size),steps=1000)
|
||||
|
||||
|
||||
def eval_input_fn(features, labels, batch_size):
|
||||
features = dict(features)
|
||||
inputs = (features, labels)
|
||||
dataset = tf.data.Dataset.from_tensor_slices(inputs)
|
||||
dataset = dataset.batch(batch_size)
|
||||
return dataset.make_one_shot_iterator().get_next()
|
||||
|
||||
eval_result = classifier.evaluate(
|
||||
input_fn = lambda :eval_input_fn(test_x, test_y, batch_size))
|
||||
|
||||
print(eval_result)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue