I have a seed GeoDataFrame (GDF) (RED) that contains a global 0.5 arc-minute grid ((180 * 2) * (360 * 2) = 259,200 cells). Each cell holds an absolute population estimate. In addition, I have a "leech" GDF (GREEN) with roughly 8,250 contiguous, irregularly shaped geometries of varying sizes (watersheds).
I wrote a script that distributes the population estimates over the geometries in the leech GDF, based on the area of overlap between the grid cells (seed GDF) and the geometries in the leech GDF. The script works fine on my sample data (see below). However, as soon as I run it on my real data it is very slow. I let it run overnight, and by the next morning only 27% of the calculations were done. I will have to run this script repeatedly, and waiting two days each time is simply not an option.
After a bit of reading I have already replaced the (?) for loops with for index, row in df.iterrows()
(or is that effectively the same as a "plain" Python for loop?), but it did not bring the performance gain I was hoping for.
Any ideas how I can speed up my code? In twelve hours my script has processed only ~30,000 of ~200,000 rows.
My expected output is the leech_df['leeched_value'] column. (A loop-free sketch of the allocation rule follows right after the script and its plot.)
import geopandas as gpd
import time
from datetime import datetime
from shapely.geometry import Polygon
# =============================================================================
# Geometries for testing
# =============================================================================
polys1 = gpd.GeoSeries([Polygon([(0.00,0.00), (0.00,0.25), (0.25,0.25), (0.25,0.00)]),
                        Polygon([(0.00,0.25), (0.00,0.50), (0.25,0.50), (0.25,0.25)]),
                        Polygon([(0.00,0.50), (0.00,0.75), (0.25,0.75), (0.25,0.50)]),
                        Polygon([(0.25,0.00), (0.25,0.25), (0.50,0.25), (0.50,0.00)]),
                        Polygon([(0.25,0.25), (0.25,0.50), (0.50,0.50), (0.50,0.25)]),
                        Polygon([(0.25,0.50), (0.25,0.75), (0.50,0.75), (0.50,0.50)]),
                        Polygon([(0.50,0.00), (0.50,0.25), (0.75,0.25), (0.75,0.00)]),
                        Polygon([(0.50,0.25), (0.50,0.50), (0.75,0.50), (0.75,0.25)]),
                        Polygon([(0.50,0.50), (0.50,0.75), (0.75,0.75), (0.75,0.50)]),
                        ])
polys2 = gpd.GeoSeries([Polygon([(0.125,0.125), (0.125,0.375), (0.375,0.375), (0.375,0.125)]),
                        Polygon([(0.050,0.550), (0.050,0.700), (0.200,0.700), (0.200,0.550)]),
                        Polygon([(0.25,0.625), (0.25,0.375), (0.750,0.375), (0.750,0.625)]),
                        ])
seed_df = gpd.GeoDataFrame({'geometry': polys1, 'seed_value':[10,10,10,10,10,10,10,10,10]})
leech_df = gpd.GeoDataFrame({'geometry': polys2})
del polys1, polys2
seed_value = 'seed_value'
# =============================================================================
# Prepare DataFrames
# =============================================================================
start = time.time()
print('\n\nPrepare DataFrames ... ')
# Create a new index for the seed DF
# The old index will be copied into the seed DF as a new column
# and transferred into the merged DF
seed_df = seed_df.reset_index()
seed_df = seed_df.rename(columns={'index': 'seed_index'})
# Create a new index for the leech DF
# The old index will be copied into the leech DF as a new column
# and transferred into the merged DF
leech_df = leech_df.reset_index()
leech_df = leech_df.rename(columns={'index': 'leech_index'})
end = time.time()
print(end - start)
# Add the area to the geometries
start = time.time()
print('Calculating the area of the leech DF geometries ...')
leech_df['leech_area'] = leech_df['geometry'].area
print('Calculating the area of the seed DF geometries ...')
seed_df['seed_area'] = seed_df['geometry'].area
leech_df['leeched_value'] = 0.0
end = time.time()
print(end - start)
# =============================================================================
# Merge seed and leech data and count overlaps
# =============================================================================
start = time.time()
print('Merging DataFrames ... ')
merged_df = gpd.overlay(leech_df, seed_df, how='union')
# Drop NaNs
merged_df = merged_df.dropna(axis='rows')
# =============================================================================
# Allocate seed values
# =============================================================================
# Count with how many leech geometries each seed geometry overlaps
print('Count overlaps ... ')
overlaps = merged_df['seed_index'].value_counts()
neglected_values = list()
one_overlaps_values = list()
more_overlaps_values = list()
no_overlaps = list()
one_overlaps = list()
more_overlaps = list()
end = time.time()
print(end - start)
start = time.time()
print('Allocate seed values ... ')
i = 0
for index, row in seed_df.iterrows():  # For each row in seed_df MAX 70123
    if row['seed_index'] % 1000 == 0:
        print(str(row['seed_index'] // 1000) + 'k at ' + str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    # Look up how many leech geometries this seed_index overlaps;
    # overlaps.get(...) returns 0 whenever the seed_index does not
    # turn up in overlaps at all
    if overlaps.get(row['seed_index'], 0) == 0:  # If the seed geometry does not have an overlap
        #print('Grid cell ' + str(i) + ' - ' + str(row['seed_index']) + ': No overlap')
        no_overlaps.append(1)  # Count the cells which have not been considered
        neglected_values.append(row[seed_value])  # Count the values which have not been considered
        i = i + 1
    elif overlaps.get(row['seed_index'], 0) == 1:
        #print('Grid cell ' + str(i) + ' - ' + str(row['seed_index']) + ': One overlap')
        one_overlaps.append(1)
        # With which leech geometry does the overlap exist (which leech index)?
        temp_int = int(merged_df[merged_df['seed_index'] == row['seed_index']]['leech_index'])
        # For this leech index replace leeched_value with leeched_value + seed_value_amount
        seed_value_amount = int(seed_df[seed_df['seed_index'] == row['seed_index']][seed_value])
        leech_df.loc[temp_int, 'leeched_value'] = leech_df.loc[temp_int, 'leeched_value'] + seed_value_amount
        one_overlaps_values.append(row[seed_value])  # Count the values allocated to a single geometry
        i = i + 1
    elif overlaps.get(row['seed_index'], 0) > 1:
        #print('Grid cell ' + str(i) + ' - ' + str(row['seed_index']) + ': More than one overlap')
        more_overlaps.append(1)
        # Create a DF which contains the overlaps of this seed geometry with geometries of the leech DF
        temp_df = merged_df[merged_df['seed_index'] == row['seed_index']]
        # Calculate the overlapping area
        temp_df['overlay_area'] = temp_df['geometry'].area
        # And compare this to the total overlapping area of this grid cell
        temp_df['rel_contribution'] = 0.0
        temp_df['rel_contribution'] = temp_df['overlay_area'] / sum(temp_df.area)
        # Calculate the absolute distribution of the seed value to each leech row ('leech_index')
        temp_df['abs_contribution'] = temp_df[seed_value] * temp_df['rel_contribution']
        # Reset index
        temp_df = temp_df.reset_index(drop=True)
        more_overlaps_values.append(row[seed_value])  # Count the values split across several geometries
        i = i + 1
        # For each overlap between grid cell (seed) and leech geometry:
        for j in range(len(leech_df)):
            #print('== LEECH ' + str(j) + '.' + str(i))
            # Create a DF which only contains the temp_df rows for this specific leech geometry
            contribution = temp_df[temp_df['leech_index'] == j]
            # Save the contribution to the leech_df
            #leech_df['leeched_value'][j] = leech_df.loc[:,('leeched_value')][j] + sum(contribution['abs_contribution'])
            leech_df.loc[j, 'leeched_value'] = leech_df.loc[j, 'leeched_value'] + sum(contribution['abs_contribution'])
end = time.time()
print(end - start)
print('Done')
# =============================================================================
# Data check
# =============================================================================
print('>1 Overlaps: ' + str(sum(more_overlaps)) + ' (sum of split values ' + str(sum(more_overlaps_values)) + ')')
print('=1 Overlaps: ' + str(sum(one_overlaps)) + ' (sum of fully allocated values ' + str(sum(one_overlaps_values)) + ')')
print('No Overlaps: ' + str(sum(no_overlaps)) + ' (sum of neglected values ' + str(sum(neglected_values)) + ')')
print('Unconsidered: ' + str(sum(seed_df[seed_value]) - (sum(more_overlaps_values) + sum(one_overlaps_values) + sum(neglected_values))))
# =============================================================================
# Plot
# =============================================================================
base = seed_df.plot(color='red', edgecolor='black');
leech_df.plot(ax=base, color='green', alpha=0.5);
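For clarity, the allocation rule in the loop above boils down to: every intersection piece receives seed_value * (piece area / total intersected area of its seed cell), and these shares are summed per leech geometry. The snippet below is a rough, loop-free sketch of that same rule using gpd.overlay(..., how='intersection') plus groupby; it reuses the seed_index / leech_index / seed_value columns prepared above, and I have not verified it against the real data:

# Loop-free sketch of the same allocation rule (illustration only, not verified on the real data)
pieces = gpd.overlay(leech_df, seed_df, how='intersection')
pieces['piece_area'] = pieces.geometry.area
# Share of each seed cell's total intersected area carried by this piece
pieces['rel_contribution'] = (pieces['piece_area'] /
                              pieces.groupby('seed_index')['piece_area'].transform('sum'))
# Absolute share of the seed value carried by this piece
pieces['abs_contribution'] = pieces[seed_value] * pieces['rel_contribution']
# Sum the contributions per leech geometry; watersheds without any overlap get 0
leeched = (pieces.groupby('leech_index')['abs_contribution'].sum()
           .reindex(leech_df['leech_index'], fill_value=0.0))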
EDIT: pip freeze > requirements_output.txt returns:
affine==2.3.0
alabaster==0.7.12
appdirs==1.4.3
argh==0.26.2
asn1crypto==1.3.0
astroid==2.3.3
atomicwrites==1.3.0
attrs==19.3.0
autopep8==1.4.4
Babel==2.8.0
backcall==0.1.0
bcrypt==3.1.7
bleach==3.1.0
cachetools==4.0.0
Cartopy==0.17.0
certifi==2019.11.28
cffi==1.13.2
cftime==1.0.4.2
chardet==3.0.4
Click==7.0
click-plugins==1.1.1
cligj==0.5.0
cloudpickle==1.2.2
colorama==0.4.3
country-converter==0.6.6
cryptography==2.8
cycler==0.10.0
dask==2.10.0
datacube==1.7
decorator==4.4.1
defusedxml==0.6.0
Deprecated==1.2.7
descartes==1.1.0
diff-match-patch==20181111
docutils==0.16
entrypoints==0.3
Fiona==1.8.13
flake8==3.7.9
future==0.18.2
GDAL==3.0.2
geocube==0.0.10
geopandas==0.6.2
h5py==2.9.0
idna==2.8
imageio==2.6.1
imagesize==1.2.0
importlib-metadata==1.4.0
intervaltree==3.0.2
ipykernel==5.1.4
ipython==7.11.1
ipython-genutils==0.2.0
isort==4.3.21
jedi==0.14.1
Jinja2==2.10.3
joblib==0.14.1
jsonschema==3.2.0
jupyter-client==5.3.4
jupyter-core==4.6.1
keyring==21.1.0
kiwisolver==1.1.0
lark-parser==0.8.1
lazy-object-proxy==1.4.3
lxml==4.4.2
mapclassify==2.2.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mccabe==0.6.1
mistune==0.8.4
mkl-fft==1.0.15
mkl-random==1.1.0
mkl-service==2.3.0
more-itertools==8.0.2
munch==2.5.0
nbconvert==5.6.1
nbformat==5.0.4
netCDF4==1.5.3
notebook==6.0.0
numpy==1.16.4
numpydoc==0.9.2
olefile==0.46
OWSLib==0.19.1
packaging==20.1
pandas==0.25.0
pandocfilters==1.4.2
paramiko==2.6.0
parso==0.6.0
pathtools==0.1.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==7.0.0
pluggy==0.13.1
prometheus-client==0.7.1
prompt-toolkit==3.0.3
psutil==5.6.7
psycopg2==2.8.4
pycodestyle==2.5.0
pycparser==2.19
pydocstyle==4.0.1
pyepsg==0.4.0
pyflakes==2.1.1
Pygments==2.5.2
pykdtree==1.3.1
pylint==2.4.4
pymatreader==0.0.20
Pympler==0.7
pymrio==0.4.0
PyNaCl==1.3.0
pyOpenSSL==19.1.0
pyparsing==2.4.6
pyPEG2==2.15.2
pyproj==2.4.2.post1
PyQt5-sip==4.19.19
pyrsistent==0.15.7
pyshp==2.1.0
PySocks==1.7.1
python-dateutil==2.8.1
python-jsonrpc-server==0.3.4
python-language-server==0.31.7
pytz==2019.3
pywin32==227
pywin32-ctypes==0.2.0
pywinpty==0.5.7
PyYAML==5.2
pyzmq==18.1.0
QDarkStyle==2.8
QtAwesome==0.6.1
qtconsole==4.6.0
QtPy==1.9.0
rasterio==1.1.2
requests==2.22.0
rioxarray==0.0.19
rope==0.16.0
Rtree==0.9.3
scikit-learn==0.22.1
scipy==1.3.2
Send2Trash==1.5.0
Shapely==1.7.0
singledispatch==3.4.0.3
six==1.14.0
snowballstemmer==2.0.0
snuggs==1.4.7
sortedcontainers==2.1.0
Sphinx==2.3.1
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
spyder==4.0.0
spyder-kernels==1.8.1
spyder-notebook==0.1.4
SQLAlchemy==1.3.13
terminado==0.8.3
testpath==0.4.4
toolz==0.10.0
tornado==6.0.3
tqdm==4.43.0
traitlets==4.3.3
ujson==1.35
urllib3==1.25.8
watchdog==0.9.0
wcwidth==0.1.7
webencodings==0.5.1
win-inet-pton==1.1.0
wincertstore==0.2
wrapt==1.11.2
xarray==0.14.1
xlrd==1.2.0
xmltodict==0.12.0
yapf==0.28.0
zipp==0.6.0