Сейчас ваш код, который выглядит так:
# Code quoted from the question, reproduced as posted (the `if` body is shown without indentation).
train_size = 1277055
test_size = 319264
if len(weather) > train_size:
# Takes positions 0 .. 1277054.
first = weather[:1277055]
# Starts at position 319264, so it overlaps `first` instead of beginning where `first` ends.
# NOTE(review): presumably `weather` is a pandas DataFrame (the later snippets use .iloc) - confirm.
rest = weather[319264:]
, определяет `rest` как все строки после первых 319264 (то есть `rest` пересекается с `first`), в то время как `first` — это, как и положено, первые 1277055 строк. Возможно, вместо этого вы хотели сделать так:
# Split `weather` positionally: the first train_size rows, then the next test_size rows.
train_size = 1277055
test_size = 319264
# `>=` so that a frame with exactly train_size + test_size rows is still split.
if len(weather) >= (train_size + test_size):
    # Positions 0 .. train_size - 1 (the iloc stop bound is exclusive).
    first = weather.iloc[:train_size, :]
    # Start at train_size, not train_size + 1: the stop bound of `first` is exclusive,
    # so position train_size is the first row not yet taken. Starting one later would
    # skip that row and return one row too few when the frame has exactly
    # train_size + test_size rows.
    rest = weather.iloc[train_size:(train_size + test_size), :]  # same as weather.iloc[1277055:1596319, :]
Альтернативный вариант — `train_test_split` из sklearn:
# Random, non-overlapping train/test subsets of `weather` via scikit-learn.
train_size = 1277055
test_size = 319264
# train_test_split shuffles by default, so unlike a positional slice the rows are
# drawn at random; pass shuffle=False to keep the original row order.
train_idx, test_idx = train_test_split(weather.index, train_size=train_size, test_size=test_size)
# The split values are index *labels*, so select with .loc. Using .iloc would treat
# them as positions, which is only correct for a default 0..n-1 RangeIndex.
# NOTE(review): assumes the index labels are unique - confirm for the real data.
df_train = weather.loc[train_idx, :]
df_test = weather.loc[test_idx, :]
Пример использования:
In [1]: import numpy as np
...: import pandas as pd
...: train_size = 1277055
...: test_size = 319264
...: weather = pd.DataFrame(np.random.randint(0,100,size=(train_size+test_size, 4)), columns=list('ABCD'))
...: print(weather.head())
A B C D
0 13 91 68 35
1 52 30 52 59
2 16 22 73 24
3 62 86 27 96
4 88 54 23 4
In [2]: if len(weather) >= (train_size + test_size):
...: print('subsetting')
...: first = weather.iloc[:train_size, :]
...: rest = weather.iloc[(train_size + 1):(train_size + test_size + 1), :]
...:
...: print(first.shape)
...: print(rest.shape)
...:
subsetting
(1277055, 4)
(319263, 4)