#读入数据
with gzip.open('morethan_mfcc.pkl', 'rb') as ifp:
df = pickle.load(ifp)
pd.set_option('display.width', 100)
df = df.reset_index()
#print(df)
df.drop('index', inplace=True, axis=1)
for index, row in df.iterrows():
df.at[index, 'lengths']=int(row['MFCC'].shape[0])
print(df.head())
MFCC ........................ lengths
0 [[-10.494777560823318, 19.205118287686812, 16.... ... 1099.0
1 [[-11.632857909892738, 5.8760442872698695, 29.... ... 1099.0
2 [[-8.773702814090234, -12.86215112148708, -7.2... ... 299.0
3 [[-8.411895008114222, -1.9307392294278005, -7.... ... 299.0
4 [[-6.969186995608002, -17.02179303376442, -23.... ... 299.0
le = LabelEncoder()
le.fit(df['Label'])
le.classes_
print(le.classes_)
#数据预准备
df_train,df_test = train_test_split(df, test_size=0.2, random_state=0)
df_test, df_val = train_test_split(df_test, test_size=0.75, random_state=0)
print('训练数:',df_train.shape[0])
print('验证数:',df_val.shape[0])
print('测试数:',df_test.shape[0])
训练数: 25
验证数: 6
测试数: 1
class BucketedDataIterator():
def __init__(self, df, num_buckets=7):
df = df.sort_values('lengths').reset_index(drop=True)
self.size = len(df) / num_buckets
self.dfs = []
for bucket in range(num_buckets):
self.dfs.append(df.loc[bucket * self.size: (bucket + 1) * self.size - 1])
self.num_buckets = num_buckets
# cursor[i] will be the cursor for the ith bucket
self.cursor = np.array([0] * num_buckets)
self.shuffle()
self.epochs = 0
def shuffle(self):
# sorts dataframe by sequence length, but keeps it random within the same length
for i in range(self.num_buckets):
self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
self.cursor[i] = 0
def next_batch(self, n):
if np.any(self.cursor + n + 1 > self.size):
self.epochs += 1
self.shuffle()
i = np.random.randint(0, self.num_buckets)
res = self.dfs[i].loc[self.cursor[i]:self.cursor[i] + n - 1]
self.cursor[i] += n
# Pad sequences with 0s so they are all the same length
maxlen = int(max(res['lengths']))
x = np.zeros([n, maxlen], dtype=np.int32)
#原码是:x = np.zeros([n, maxlen,32], dtype=np.float)也不行
for i, x_i in enumerate(x):
x_i[:res['lengths'].values[i]] = res['MFCC'].values[i]
return x, le.transform(res['Label'].values), res['lengths']
tr = BucketedDataIterator(df_train)
mini_batch = tr.next_batch(128)
X = mini_batch[0]
y = mini_batch[1]
seqlen = mini_batch[2]
print('X shape:', X.shape)
print('Target:', y[:5])
print('Sequence length:\n', seqlen[:5])
Traceback (most recent call last):
File "D:/Project/pycharm/RNN/create_mfcc.py", line 164, in <module>
mini_batch = tr.next_batch(128)
File "D:/Project/pycharm/RNN/create_mfcc.py", line 160, in next_batch
x_i[:res['lengths'].values[i]] = res['MFCC'].values[i]
TypeError: slice indices must be integers or None or have an __index__ method