Я пытаюсь предсказать цену жилья по этому набору данных.
Я использую модель линейной регрессии, но при обучении получаю ошибку
`ValueError: could not convert string to float:` (не удалось преобразовать строку в число с плавающей точкой), как показано ниже.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
# Load the Bengaluru house-price training CSV into a DataFrame and preview the first rows.
data = pd.read_csv("Predicting-House-Prices-In-Bengaluru-Train-Data.csv")
data.head()
area_type availability location size society total_sqft bath balcony price
0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK Coomee 1056 2.0 1.0 39.07
1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom Theanmp 2600 5.0 3.0 120.00
2 Built-up Area Ready To Move Uttarahalli 3 BHK NaN 1440 2.0 3.0 62.00
3 Super built-up Area Ready To Move Lingadheeranahalli 3 BHK Soiewre 1521 3.0 1.0 95.00
4 Super built-up Area Ready To Move Kothanur 2 BHK NaN 1200 2.0 1.0 51.00
# Replace missing `location` values with an empty string (inplace=True requested).
data['location'].fillna('', inplace=True)
# NOTE(review): the result of drop() is not assigned back and inplace=True is not used,
# so this only displays a copy without the row labelled 47; `data` itself keeps that row
# (y later still reports Length: 13319). Confirm whether the drop was meant to persist.
data.drop([47],axis=0)
area_type availability location size society total_sqft bath balcony price
0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK Coomee 1056 2.0 1.0 39.07
1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom Theanmp 2600 5.0 3.0 120.00
2 Built-up Area Ready To Move Uttarahalli 3 BHK NaN 1440 2.0 3.0 62.00
3 Super built-up Area Ready To Move Lingadheeranahalli 3 BHK Soiewre 1521 3.0 1.0 95.00
4 Super built-up Area Ready To Move Kothanur 2 BHK NaN 1200 2.0 1.0 51.00
5 Super built-up Area Ready To Move Whitefield 2 BHK DuenaTa 1170 2.0 1.0 38.00
6 Super built-up Area 18-May Old Airport Road 4 BHK Jaades 2732 4.0 NaN 204.00
7 Super built-up Area Ready To Move Rajaji Nagar 4 BHK Brway G 3300 4.0 NaN 600.00
8 Super built-up Area Ready To Move Marathahalli 3 BHK NaN 1310 3.0 1.0 63.25
9 Plot Area Ready To Move Gandhi Bazar 6 Bedroom NaN 1020 6.0 NaN 370.00
10 Super built-up Area 18-Feb Whitefield 3 BHK NaN 1800 2.0 2.0 70.00
11 Plot Area Ready To Move Whitefield 4 Bedroom Prrry M 2785 5.0 3.0 295.00
12 Super built-up Area Ready To Move 7th Phase JP Nagar 2 BHK Shncyes 1000 2.0 1.0 38.00
13 Built-up Area Ready To Move Gottigere 2 BHK NaN 1100 2.0 2.0 40.00
14 Plot Area Ready To Move Sarjapur 3 Bedroom Skityer 2250 3.0 2.0 148.00
15 Super built-up Area Ready To Move Mysore Road 2 BHK PrntaEn 1175 2.0 2.0 73.50
16 Super built-up Area Ready To Move Bisuvanahalli 3 BHK Prityel 1180 3.0 2.0 48.00
17 Super built-up Area Ready To Move Raja Rajeshwari Nagar 3 BHK GrrvaGr 1540 3.0 3.0 60.00
18 Super built-up Area Ready To Move Ramakrishnappa Layout 3 BHK PeBayle 2770 4.0 2.0 290.00
19 Super built-up Area Ready To Move Manayata Tech Park 2 BHK NaN 1100 2.0 2.0 48.00
20 Built-up Area Ready To Move Kengeri 1 BHK NaN 600 1.0 1.0 15.00
21 Super built-up Area 19-Dec Binny Pete 3 BHK She 2rk 1755 3.0 1.0 122.00
22 Plot Area Ready To Move Thanisandra 4 Bedroom Soitya 2800 5.0 2.0 380.00
23 Super built-up Area Ready To Move Bellandur 3 BHK NaN 1767 3.0 1.0 103.00
24 Super built-up Area 18-Nov Thanisandra 1 RK Bhe 2ko 510 1.0 0.0 25.25
25 Super built-up Area 18-May Mangammanapalya 3 BHK NaN 1250 3.0 2.0 56.00
26 Super built-up Area Ready To Move Electronic City 2 BHK Itelaa 660 1.0 1.0 23.10
27 Built-up Area 20-Dec Whitefield 3 BHK NaN 1610 3.0 2.0 81.00
28 Super built-up Area 17-Oct Ramagondanahalli 2 BHK ViistLa 1151 2.0 2.0 48.77
29 Super built-up Area Ready To Move Electronic City 3 BHK KBityo 1025 2.0 1.0 47.00
... ... ... ... ... ... ... ... ... ...
13289 Super built-up Area Ready To Move Sarjapur Road 4 BHK Maana E 4050 2.0 1.0 450.00
13290 Plot Area 18-Jan Weavers Colony 1 Bedroom NaN 812 1.0 0.0 26.00
13291 Super built-up Area 18-Jul Udayapur Village 3 BHK Plowsri 1440 2.0 2.0 63.93
13292 Super built-up Area Ready To Move Sarjapur Road 4 BHK Puallhi 2425 5.0 1.0 195.00
13293 Super built-up Area Ready To Move Sultan Palaya 4 BHK RSntsAp 2200 3.0 3.0 80.00
13294 Super built-up Area 18-Feb Haralur Road 3 BHK SNnia E 1810 3.0 2.0 112.00
13295 Super built-up Area Ready To Move Cox Town 2 BHK NaN 1200 2.0 2.0 140.00
13296 Super built-up Area Ready To Move Electronic City 2 BHK GMown E 1060 2.0 1.0 52.00
13297 Super built-up Area Ready To Move Kenchenahalli 2 BHK AriosPa 1015 2.0 2.0 60.00
13298 Super built-up Area 18-Dec Whitefield 4 BHK Prtates 2830 - 2882 5.0 0.0 154.50
13299 Plot Area Ready To Move Hosakerehalli 5 Bedroom NaN 1500 6.0 2.0 145.00
13300 Super built-up Area Ready To Move Kothanur 3 BHK NaN 1454 3.0 3.0 71.50
13301 Super built-up Area Ready To Move Annaiah Reddy Layout 2 BHK NaN 1075 2.0 2.0 48.00
13302 Plot Area Ready To Move Vidyaranyapura 5 Bedroom NaN 774 5.0 3.0 70.00
13303 Super built-up Area Ready To Move Raja Rajeshwari Nagar 2 BHK GrrvaGr 1187 2.0 2.0 40.14
13304 Carpet Area Ready To Move Hulimavu 1 BHK NaN 500 1.0 3.0 220.00
13305 Plot Area Ready To Move Rajarajeshwari Nagara 4 Bedroom NaN 1200 5.0 NaN 325.00
13306 Built-up Area Ready To Move Billekahalli 3 BHK NaN 1805 3.0 3.0 134.00
13307 Built-up Area Ready To Move Bannerghatta Road 3 BHK Baanise 1527 3.0 1.0 142.00
13308 Super built-up Area Ready To Move Yeshwanthpur 3 BHK IBityin 1675 3.0 NaN 92.13
13309 Super built-up Area Ready To Move Rachenahalli 2 BHK NaN 1050 2.0 2.0 52.71
13310 Plot Area Ready To Move Ramamurthy Nagar 7 Bedroom NaN 1500 9.0 2.0 250.00
13311 Super built-up Area Ready To Move Bellandur 2 BHK NaN 1262 2.0 2.0 47.00
13312 Super built-up Area Ready To Move Uttarahalli 3 BHK Aklia R 1345 2.0 1.0 57.00
13313 Super built-up Area Ready To Move Green Glen Layout 3 BHK SoosePr 1715 3.0 3.0 112.00
13314 Built-up Area Ready To Move Whitefield 5 Bedroom ArsiaEx 3453 4.0 0.0 231.00
13315 Super built-up Area Ready To Move Richards Town 4 BHK NaN 3600 5.0 NaN 400.00
13316 Built-up Area Ready To Move Raja Rajeshwari Nagar 2 BHK Mahla T 1141 2.0 1.0 60.00
13317 Super built-up Area 18-Jun Padmanabhanagar 4 BHK SollyCl 4689 4.0 1.0 488.00
13318 Super built-up Area Ready To Move Doddathoguru 1 BHK NaN 550 1.0 1.0 17.00
13318 rows × 9 columns
# Preview the first rows of `data` again after the steps above.
data.head()
area_type availability location size society total_sqft bath balcony price
0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK Coomee 1056 2.0 1.0 39.07
1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom Theanmp 2600 5.0 3.0 120.00
2 Built-up Area Ready To Move Uttarahalli 3 BHK NaN 1440 2.0 3.0 62.00
3 Super built-up Area Ready To Move Lingadheeranahalli 3 BHK Soiewre 1521 3.0 1.0 95.00
4 Super built-up Area Ready To Move Kothanur 2 BHK NaN 1200 2.0 1.0 51.00
# Count the missing values remaining in `location`.
data['location'].isnull().sum()
0
# Count the missing values in `total_sqft` (this checks for NaN only, not for non-numeric strings).
data['total_sqft'].isnull().sum()
0
# Fill missing locations with an empty string once more, then cast the column to str.
data['location'].fillna('', inplace=True)
data['location'] = data['location'].astype(str)
# List the dtype of every column to see which ones are still non-numeric (object).
data.dtypes
area_type object
availability object
location object
size object
society object
total_sqft object
bath float64
balcony float64
price float64
dtype: object
# Label-encode the column at position 2 (`location`), replacing each distinct string
# with an integer code.
# NOTE(review): integer codes imply an ordering between locations that a linear model
# will treat as numeric; confirm this is intended rather than one-hot encoding.
enc = LabelEncoder()
data.iloc[:,2] = enc.fit_transform(data.iloc[:,2])
data.head()
area_type availability location size society total_sqft bath balcony price
0 Super built-up Area 19-Dec 420 2 BHK Coomee 1056 2.0 1.0 39.07
1 Plot Area Ready To Move 318 4 Bedroom Theanmp 2600 5.0 3.0 120.00
2 Built-up Area Ready To Move 1180 3 BHK NaN 1440 2.0 3.0 62.00
3 Super built-up Area Ready To Move 758 3 BHK Soiewre 1521 3.0 1.0 95.00
4 Super built-up Area Ready To Move 717 2 BHK NaN 1200 2.0 1.0 51.00
# Feature matrix: columns at positions 0-7, i.e. every column except the last one (`price`).
X = data.iloc[:,[0,1,2,3,4,5,6,7]]
X.head()
area_type availability location size society total_sqft bath balcony
0 Super built-up Area 19-Dec 420 2 BHK Coomee 1056 2.0 1.0
1 Plot Area Ready To Move 318 4 Bedroom Theanmp 2600 5.0 3.0
2 Built-up Area Ready To Move 1180 3 BHK NaN 1440 2.0 3.0
3 Super built-up Area Ready To Move 758 3 BHK Soiewre 1521 3.0 1.0
4 Super built-up Area Ready To Move 717 2 BHK NaN 1200 2.0 1.0
# Target vector: the `price` column.
y = data.price
y.head()
0 39.07
1 120.00
2 62.00
3 95.00
4 51.00
Name: price, dtype: float64
# Display the full target Series (shows its length and dtype).
y
0 39.07
1 120.00
2 62.00
3 95.00
4 51.00
5 38.00
6 204.00
7 600.00
8 63.25
9 370.00
10 70.00
11 295.00
12 38.00
13 40.00
14 148.00
15 73.50
16 48.00
17 60.00
18 290.00
19 48.00
20 15.00
21 122.00
22 380.00
23 103.00
24 25.25
25 56.00
26 23.10
27 81.00
28 48.77
29 47.00
...
13289 450.00
13290 26.00
13291 63.93
13292 195.00
13293 80.00
13294 112.00
13295 140.00
13296 52.00
13297 60.00
13298 154.50
13299 145.00
13300 71.50
13301 48.00
13302 70.00
13303 40.14
13304 220.00
13305 325.00
13306 134.00
13307 142.00
13308 92.13
13309 52.71
13310 250.00
13311 47.00
13312 57.00
13313 112.00
13314 231.00
13315 400.00
13316 60.00
13317 488.00
13318 17.00
Name: price, Length: 13319, dtype: float64
# One-hot encode the remaining object-dtype columns; drop_first=True omits the first
# level of each encoded column.
# NOTE(review): `total_sqft` is still dtype object (see data.dtypes above), so it is
# expanded into one dummy column per distinct string (including ranges such as
# "980 - 1030") instead of being used as a number; presumably it should be converted
# to a numeric column before this step. Confirm.
X = pd.get_dummies(X, drop_first=True)
# Hold out 30% of the rows for testing. No random_state is given, so the split
# differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.3)
X_train
location bath balcony area_type_Carpet Area area_type_Plot Area area_type_Super built-up Area availability_14-Nov availability_15-Aug availability_15-Dec availability_15-Jun ... total_sqft_990 total_sqft_991 total_sqft_992 total_sqft_993 total_sqft_994 total_sqft_995 total_sqft_996 total_sqft_997 total_sqft_998 total_sqft_999
9114 665 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1324 418 4.0 0.0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4864 420 2.0 1.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11954 627 3.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11832 1191 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5065 1000 2.0 1.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12548 1225 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9745 495 4.0 3.0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8851 973 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6012 418 4.0 3.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1956 708 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6861 389 5.0 2.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7343 642 2.0 2.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8963 638 5.0 1.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3634 418 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3300 1057 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11290 1169 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8659 1139 3.0 0.0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9313 1154 6.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9993 800 3.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4515 955 3.0 NaN 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2123 1191 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1189 1191 2.0 2.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4753 140 3.0 NaN 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8321 814 2.0 0.0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8268 516 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6184 585 3.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8542 543 2.0 3.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4812 1253 3.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12648 708 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3147 537 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6309 1193 2.0 1.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2388 75 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10066 714 4.0 0.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11099 1253 6.0 NaN 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6000 746 3.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11849 1166 2.0 2.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4763 1253 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6444 116 2.0 1.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4279 687 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1842 1011 2.0 3.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2493 689 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7668 746 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1599 418 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
692 157 3.0 3.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9144 307 4.0 1.0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13190 366 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6300 671 3.0 2.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4332 1253 2.0 3.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7061 849 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2566 1253 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
876 143 2.0 NaN 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7805 1154 4.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11385 1218 2.0 2.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3363 1253 1.0 0.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9366 893 2.0 1.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4579 269 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9362 189 1.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1070 221 2.0 2.0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9413 536 2.0 1.0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9323 rows × 4917 columns
# Inspect the dtypes of the encoded training features.
X_train.dtypes
location int64
bath float64
balcony float64
area_type_Carpet Area uint8
area_type_Plot Area uint8
area_type_Super built-up Area uint8
availability_14-Nov uint8
availability_15-Aug uint8
availability_15-Dec uint8
availability_15-Jun uint8
availability_15-Nov uint8
availability_15-Oct uint8
availability_16-Dec uint8
availability_16-Jan uint8
availability_16-Jul uint8
availability_16-Mar uint8
availability_16-Nov uint8
availability_16-Oct uint8
availability_16-Sep uint8
availability_17-Apr uint8
availability_17-Aug uint8
availability_17-Dec uint8
availability_17-Feb uint8
availability_17-Jan uint8
availability_17-Jul uint8
availability_17-Jun uint8
availability_17-Mar uint8
availability_17-May uint8
availability_17-Nov uint8
availability_17-Oct uint8
...
total_sqft_967 uint8
total_sqft_970 uint8
total_sqft_971 uint8
total_sqft_972 uint8
total_sqft_973 uint8
total_sqft_975 uint8
total_sqft_976 uint8
total_sqft_977 uint8
total_sqft_978 uint8
total_sqft_980 uint8
total_sqft_980 - 1030 uint8
total_sqft_981 uint8
total_sqft_981 - 1249 uint8
total_sqft_982 uint8
total_sqft_983 uint8
total_sqft_984 uint8
total_sqft_985 uint8
total_sqft_986 uint8
total_sqft_987 uint8
total_sqft_989 uint8
total_sqft_990 uint8
total_sqft_991 uint8
total_sqft_992 uint8
total_sqft_993 uint8
total_sqft_994 uint8
total_sqft_995 uint8
total_sqft_996 uint8
total_sqft_997 uint8
total_sqft_998 uint8
total_sqft_999 uint8
Length: 4917, dtype: object
# Display the training target values.
y_train
9114 35.00
1324 120.00
4864 37.83
11954 97.00
11832 54.00
5065 48.50
12548 71.95
9745 300.00
8851 45.00
6012 119.00
1956 53.33
6861 145.00
7343 45.00
8963 120.00
3634 68.00
3300 53.00
11290 67.00
8659 66.00
9313 245.00
9993 85.00
4515 700.00
2123 40.00
1189 42.00
4753 150.00
8321 175.00
8268 72.00
6184 168.00
8542 31.50
4812 140.00
12648 50.66
...
3147 86.12
6309 64.00
2388 43.50
10066 625.00
11099 700.00
6000 150.00
11849 59.80
4763 84.00
6444 48.00
4279 58.00
1842 50.00
2493 40.60
7668 75.00
1599 37.50
692 120.00
9144 89.45
13190 69.76
6300 92.00
4332 67.00
7061 63.00
2566 49.50
876 95.00
7805 152.00
11385 89.50
3363 32.79
9366 59.00
4579 42.00
9362 38.77
1070 70.00
9413 69.25
Name: price, Length: 9323, dtype: float64
# Fit a linear regression on the training split; the traceback below is raised by this fit() call.
# NOTE(review): the error text ends with nothing after the colon, which would match an
# empty string '' (such as the value used to fill missing locations) reaching the float
# conversion. Verify which column still holds strings in the session that produced it.
# NOTE(review): the previews show NaN in `balcony`; fit() is expected to reject NaN as
# well, so missing values presumably need to be imputed or dropped first. Confirm.
linear = LinearRegression()
linear.fit(X_train, y_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-29-3975a5a27c36> in <module>()
1 linear = LinearRegression()
----> 2 linear.fit(X_train, y_train)
~/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/base.py in fit(self, X, y, sample_weight)
480 n_jobs_ = self.n_jobs
481 X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 482 y_numeric=True, multi_output=True)
483
484 if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
571 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
572 ensure_2d, allow_nd, ensure_min_samples,
--> 573 ensure_min_features, warn_on_dtype, estimator)
574 if multi_output:
575 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
446 # make sure we actually converted to numeric:
447 if dtype_numeric and array.dtype.kind == "O":
--> 448 array = array.astype(np.float64)
449 if not allow_nd and array.ndim >= 3:
450 raise ValueError("Found array with dim %d. %s expected <= 2."
ValueError: could not convert string to float:
Итак, как мне исправить эту ошибку ValueError и в каком месте она возникает? Я хочу обучить модель LinearRegression() на этих данных и получить предсказания.