Я попробовал два способа, приведённых ниже:
# Load the whole 'cityweathers' table into a DataFrame; `self.engine` is
# presumably a SQLAlchemy engine/connectable owned by the enclosing object
# (the enclosing method is not shown) — confirm.
df_weather = pd.read_sql_table('cityweathers', self.engine)
# Derive helper columns on the city-weather frame: a `date_create` timestamp
# built from the year/month/day columns, one row kept per date, and a 0/1
# `fineday` flag.
# NOTE(review): no `return` is visible in this snippet even though the caller
# rebinds its variable to the result — presumably `return df_weather` was
# elided; confirm.
def process_city_weather(df_weather):
# Assemble a datetime column from the three component columns; this writes
# into the frame object that the caller passed in.
df_weather.loc[:, "date_create"] = pd.to_datetime(df_weather[["year", "month","day"]])
# df_weather.loc[:, "update"] = pd.to_datetime(df_weather.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
# Keep only the last row for each `date_create`; from here on `df_weather`
# names the new frame returned by drop_duplicates, not the caller's object.
# NOTE(review): in pandas 0.23 this result appears to be flagged as a possible
# copy of a slice whenever rows were actually dropped, which would explain the
# SettingWithCopy error raised by the assignments below; chaining `.copy()`
# onto this call is the usual remedy — confirm against the pandas version used.
df_weather = df_weather.drop_duplicates(['date_create'], keep='last')
# method 1: assign through .loc — `fineday` is 1 when the `weather` text does
# not contain '雨' (rain) and 0 otherwise.
df_weather.loc[:, 'fineday'] = df_weather.weather.apply(lambda x: '雨' not in x).astype(int)
# method 2: the same values assigned through plain column setitem.
df_weather['fineday'] = df_weather.weather.apply(lambda x: '雨' not in x).astype(int)
Оба способа не работают.
Я включил pd.options.mode.chained_assignment = 'raise',
чтобы узнать, откуда приходят предупреждения. Ошибка при использовании первого способа:
Traceback (most recent call last):
File "map_reduce.py", line 743, in <module>
main()
File "map_reduce.py", line 738, in main
delayed(purchase_count_forecaster)(order_path, stock_path) for order_path, stock_path in read_map_by_store_id()
File "C:\Anaconda3\lib\site-packages\joblib\parallel.py", line 983, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Anaconda3\lib\site-packages\joblib\parallel.py", line 825, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Anaconda3\lib\site-packages\joblib\parallel.py", line 782, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 545, in __init__
self.results = batch()
File "C:\Anaconda3\lib\site-packages\joblib\parallel.py", line 261, in __call__
for func, args, kwargs in self.items]
File "C:\Anaconda3\lib\site-packages\joblib\parallel.py", line 261, in <listcomp>
for func, args, kwargs in self.items]
File "map_reduce.py", line 391, in purchase_count_forecaster
df_weather = process_city_weather(df_weather)
File "e:\pp\sales-forecast\mlc\preprocess\weather.py", line 14, in process_city_weather
df_weather['fineday'] = df_weather.weather.apply(lambda x: '雨' not in x).astype(int)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 3119, in __setitem__
self._set_item(key, value)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 3201, in _set_item
self._check_setitem_copy()
File "C:\Anaconda3\lib\site-packages\pandas\core\generic.py", line 2712, in _check_setitem_copy
raise com.SettingWithCopyError(t)
pandas.core.common.SettingWithCopyError:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Я видел вопрос «Как бороться с SettingWithCopyWarning в Pandas?», но он не покрывает мой случай.
UPDATE
Функция, служащая точкой входа, показана ниже:
from joblib import Parallel, delayed
# Entry point: call purchase_count_forecaster once for every
# (order_path, stock_path) pair produced by read_map_by_store_id(),
# dispatching the calls through joblib's Parallel/delayed.
def main():
# One worker when DEBUG is truthy, otherwise four.
process_num = 1 if DEBUG else 4
# The list of results returned by Parallel is not kept.
Parallel(n_jobs=process_num)(
delayed(purchase_count_forecaster)(order_path, stock_path) for order_path, stock_path in read_map_by_store_id()
)
# Worker invoked by main() for each (order_path, stock_path) pair: constructs
# an HDFS client and a Database handle, then loads and preprocesses the
# weather table. The remainder of the body is elided in this snippet.
def purchase_count_forecaster(order_path, stock_path):
hdfs = HdfsClient(host=HDFS_HOST, port=HDFS_PORT)
db = Database(DB_URL)
df_weather = db.read_weathers() # pd.read_sql_table here
# Rebind df_weather to whatever process_city_weather returns.
df_weather = process_city_weather(df_weather)
# ....
ENV:
In [2]: pd.show_versions()
INSTALLED VERSIONS
------------------
commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 45 Stepping 7, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.23.4
pytest: 3.5.1
pip: 18.1
setuptools: 39.1.0
Cython: 0.28.2
numpy: 1.14.3
scipy: 1.1.0
pyarrow: 0.11.0
xarray: None
IPython: 6.4.0
sphinx: 1.7.4
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: 1.2.1
tables: 3.4.3
numexpr: 2.6.5
feather: None
matplotlib: 2.2.2
openpyxl: 2.5.3
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.4
lxml: 4.2.1
bs4: 4.6.0
html5lib: 1.0.1
sqlalchemy: 1.2.7
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None