Source From Here
libsvm 與 dataframe格式相互轉換
libsvm 轉化為 dataframe
如果 libsvm 文件的特徵索引是亂序的，直接使用 load_svmlight_file 讀取會報錯；可先用下面的函數（libsvm_index_order）將每行數據的特徵索引按升序重新排列，再進行讀取：
dataframe 轉化為 libsvm
dataframe 轉換為 libffm格式
libsvm 轉化為 dataframe
## Convert a libsvm file into a DataFrame and save it as CSV
from sklearn.datasets import load_svmlight_file
from pandas import DataFrame
import pandas as pd

# Read the feature matrix and the label vector from the libsvm file.
X_train, y_train = load_svmlight_file("libsvm_data.txt")
# toarray() gives a plain ndarray; todense() would return the discouraged
# numpy.matrix type.
mat = X_train.toarray()
df1 = pd.DataFrame(mat)
# These names assume the example file has exactly four feature columns.
df1.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df2 = pd.DataFrame(y_train)
df2.columns = ['target']
df = pd.concat([df2, df1], axis=1)  # target becomes the first column
df.to_csv("df_data.txt", index=False)
## Rewrite a libsvm file with unordered feature indices into an index-sorted file
def libsvm_index_order(input_file, out_file):
    """Copy a libsvm file, sorting each row's features by ascending index.

    Every non-blank input line is read as ``label idx:value idx:value ...``.
    The label (first token) is kept as-is and the ``idx:value`` pairs are
    written back ordered by the integer value of ``idx``.  Blank lines are
    skipped.

    Args:
        input_file: Path of the libsvm file to read.
        out_file: Path of the file to write (opened in ``'w'`` mode).

    Raises:
        ValueError: If a feature token is not of the form ``int:value``.
    """
    with open(input_file, 'r') as f_in, open(out_file, 'w') as f_out:
        # Iterate the file object directly so the whole file is never held
        # in memory at once.
        for line in f_in:
            items = line.split()
            if not items:
                # A blank line has no label token; skip it instead of
                # failing on items[0].
                continue
            label = items[0]
            features = {}
            for token in items[1:]:
                key, value = token.split(":")
                # A repeated index keeps the value of its last occurrence.
                features[int(key)] = value
            # Keys are unique ints, so sorting the pairs orders them by index.
            ordered = sorted(features.items())
            fields = [label]
            fields.extend("{}:{}".format(idx, value) for idx, value in ordered)
            f_out.write(" ".join(fields) + "\n")


# Sample input path; not referenced by any other code visible in this snippet.
input_file = "./ml-tag.train.libfm"
## Convert a DataFrame into a libsvm file
import pandas as pd
from sklearn.datasets import dump_svmlight_file

df = pd.read_csv("data.txt")  # the first column is the target
y = df.target  # label value of every row
# One-hot encode the non-numeric columns among the remaining features.
dummy = pd.get_dummies(df.iloc[:, 1:])
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
# dtype=float keeps the matrix purely numeric even when get_dummies emits
# boolean indicator columns (the default in recent pandas releases).
mat = dummy.to_numpy(dtype=float)
# zero_based defaults to True (feature ids start at 0); False starts them at 1.
dump_svmlight_file(mat, y, 'svm_output.libsvm', zero_based=False)
- import numpy as np
- import pandas as pd
- from sklearn.datasets import make_classification
class FFMFormatPandas:
    """Encode the rows of a pandas DataFrame as libffm-style text.

    Each row becomes ``label field:feature:value ...``.  Object-dtype columns
    are treated as categorical (one feature id per distinct value, emitted
    with value ``1``); signed-integer columns are treated as numeric (one
    feature id per column, emitted with the cell value).  Columns of any
    other dtype are not emitted.
    """

    def __init__(self):
        # Maps column name -> field id; built on the first call to fit().
        self.field_index_ = None
        # Maps "<column>_<value>" and "<column>" -> feature id.
        self.feature_index_ = None
        # Name of the label column, or None when there is no label.
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature indices from ``df``.

        Args:
            df: DataFrame to learn the indices from.
            y: Name of the label column in ``df``, or None.

        Returns:
            This instance, so calls can be chained.
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is None:
            self.feature_index_ = {}
            last_idx = 0
        else:
            # Continue after the highest id already handed out; starting at
            # the maximum itself would give a new feature a duplicate id.
            last_idx = max(self.feature_index_.values(), default=-1) + 1

        # Every column of df is indexed here, including the label column.
        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Keep a column's id stable when fit() is called again.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit the indices on ``df`` and return its encoded rows."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode one row as a libffm-style string.

        Args:
            row: Series holding one DataFrame row, indexed by column name.
            t: Mapping of column name -> dtype of that column.

        Returns:
            The label followed by space-separated ``field:feature:value``
            entries.  The label is ``'0'`` when no label column is set.

        Raises:
            KeyError: If a categorical value was not seen during fit().
        """
        # Label-based lookup; positional Series[int] access is deprecated.
        label = str(0) if self.y is None else str(row.loc[self.y])
        ffm = [label]

        for col, val in row.items():
            if col == self.y:
                continue
            if pd.isnull(val):
                # fit() assigns no feature id to missing values, so skip them.
                continue
            col_type = t[col]
            if col_type.kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series of encoded rows, keyed by the index of ``df``."""
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
########################### Build some sample data and try the encoder ############################
train, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=2,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)
train = pd.DataFrame(train, columns=['int1', 'int2', 'int3', 's1', 's2'])

# The first three columns are truncated to ints (numeric features).
for int_col in ('int1', 'int2', 'int3'):
    train[int_col] = train[int_col].map(int)

# The last two columns are turned into strings (categorical features).
for str_col in ('s1', 's2'):
    train[str_col] = np.log((train[str_col] + 1).abs()).round().map(str)

train['clicked'] = y

ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train, y='clicked')

print('Base data')
print(train.head(10))
print('FFM data')
print(ffm_train_data.head(10))
This message was edited 4 times. Last update was at 15/09/2020 10:12:01
沒有留言:
張貼留言