чтение и запись в sql с использованием pandas через многопроцессорность - PullRequest
0 голосов
/ 06 февраля 2020

Я имею дело с огромной таблицей, где я должен сделать запрос. Я решил сделать это, разбив мои данные на основе user_id и каждый раз читая и записывая в sql.

from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://') 


q1 = "SELECT max(id) FROM users"
max_users = pd.read_sql(q1, engine)
max_users = max_users.iloc[0][0]

# since user_ids start from 1 to ... I make the split based on that
data = range(max_users)
chunks = [list(data[x:x+1000]) for x in range(0, len(data), 1000)]


def make_q(userid):
    q2 = "SELECT alotofusers from bigtable WHERE userid in (" + str(','.join(str(e) for e in userid)) + ")"    



from multiprocessing import Pool, TimeoutError
import time
import os

table_name = "user_type_tmp6"
def f(q):
    df = pd.read_sql(q, engine)
    df.to_sql(con=engine, name=table_name, if_exists='append') 

pool = Pool(processes=10) # start 4 worker processes
pool.map(f, [make_q(item) for item in chunks[0:3]])

Фактически моя таблица заполняется только первым чанком, но я получаю следующую ошибку

Exception during reset or similar
Traceback (most recent call last):
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 680, in _finalize_fairy
    fairy._reset(pool)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 867, in _reset
    pool._dialect.do_rollback(self)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/dialects/mysql/base.py", line 2302, in do_rollback
    dbapi_connection.rollback()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 430, in rollback
    self._read_ok_packet()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 394, in _read_ok_packet
    pkt = self._read_packet()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 671, in _read_packet
    % (packet_number, self._next_seq_id))
pymysql.err.InternalError: Packet sequence number wrong - got 48 expected 1
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1246, in _execute_context
    cursor, statement, parameters, context
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 581, in do_execute
    cursor.execute(statement, parameters)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
    result = self._query(query)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
    conn.query(q)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 517, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 732, in _read_query_result
    result.read()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 1075, in read
    first_packet = self.connection._read_packet()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 671, in _read_packet
    % (packet_number, self._next_seq_id))
pymysql.err.InternalError: Packet sequence number wrong - got 114 expected 1

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 733, in _rollback_impl
    self.engine.dialect.do_rollback(self.connection)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/dialects/mysql/base.py", line 2302, in do_rollback
    dbapi_connection.rollback()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 429, in rollback
    self._execute_command(COMMAND.COM_QUERY, "ROLLBACK")
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 750, in _execute_command
    raise err.InterfaceError("(0, '')")
pymysql.err.InterfaceError: (0, '')

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "user_app_usage_type.py", line 90, in f
    df = pd.read_sql(q, engine) # index_col = 'user_id'
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 436, in read_sql
    chunksize=chunksize,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 1218, in read_query
    result = self.execute(*args)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 1087, in execute
    return self.connectable.execute(*args, **kwargs)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2182, in execute
    return connection.execute(statement, *multiparams, **params)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 976, in execute
    return self._execute_text(object_, multiparams, params)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1149, in _execute_text
    parameters,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1250, in _execute_context
    e, statement, parameters, cursor, context
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1471, in _handle_dbapi_exception
    self._autorollback()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/langhelpers.py", line 79, in __exit__
    compat.reraise(type_, value, traceback)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 153, in reraise
    raise value
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1471, in _handle_dbapi_exception
    self._autorollback()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 861, in _autorollback
    self._root._rollback_impl()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 735, in _rollback_impl
    self._handle_dbapi_exception(e, None, None, None, None)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1384, in _handle_dbapi_exception
    exc_info,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 398, in raise_from_cause
    reraise(type(exception), exception, tb=exc_tb, cause=cause)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 152, in reraise
    raise value.with_traceback(tb)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 733, in _rollback_impl
    self.engine.dialect.do_rollback(self.connection)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/dialects/mysql/base.py", line 2302, in do_rollback
    dbapi_connection.rollback()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 429, in rollback
    self._execute_command(COMMAND.COM_QUERY, "ROLLBACK")
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 750, in _execute_command
    raise err.InterfaceError("(0, '')")
sqlalchemy.exc.InterfaceError: (pymysql.err.InterfaceError) (0, '')
(Background on this error at: http://sqlalche.me/e/rvf5)
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "user_app_usage_type.py", line 109, in <module>
    pool.map(f, [make_q(item) for item in chunks[0:3]])
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 657, in get
    raise self._value
sqlalchemy.exc.InterfaceError: (pymysql.err.InterfaceError) (0, '')
(Background on this error at: http://sqlalche.me/e/rvf5)

Наверное, я делаю мультипроцессирование неправильно! Или, возможно, sqlalchemy не выровнен с пулом.

Обновление

Из моего понимания, читая это и это по предложению Ильи я обновил свою работающую функцию следующим образом:

def f(q):

    engine = create_engine('mysql+pymysql://')
    df = pd.read_sql(q, engine, index_col = 'user_id')
    df.fillna(0, inplace = True)
    df.to_csv('tmp.csv')
    df.to_sql(con=engine, name=table_name, if_exists='append' )
    engine.dispose()

, но теперь я получаю сообщения об ошибках типа

multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1246, in _execute_context
    cursor, statement, parameters, context
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 581, in do_execute
    cursor.execute(statement, parameters)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
    result = self._query(query)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
    conn.query(q)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 517, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 732, in _read_query_result
    result.read()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 1075, in read
    first_packet = self.connection._read_packet()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 684, in _read_packet
    packet.check_error()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/protocol.py", line 220, in check_error
    err.raise_mysql_exception(self._data)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/err.py", line 109, in raise_mysql_exception
    raise errorclass(errno, errval)
pymysql.err.InternalError: (1050, "Table 'users_usage_frequency_oly_12' already exists")

The above exception was the direct cause of the following exception:

    Traceback (most recent call last):
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "user_login_type.py", line 103, in f
    df.to_sql(con=engine, name=table_name, schema = 'datateam', if_exists='append' ) # dtype={'user_type': Enum('Browser', 'Hoarder', 'Mementor', 'Explorer', 'Lister', 'Scanner') }
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/core/generic.py", line 2712, in to_sql
    method=method,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 518, in to_sql
    method=method,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 1319, in to_sql
    table.create()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 656, in create
    self._execute_create()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 638, in _execute_create
    self.table.create()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/schema.py", line 870, in create
    bind._run_visitor(ddl.SchemaGenerator, self, checkfirst=checkfirst)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2049, in _run_visitor
    conn._run_visitor(visitorcallable, element, **kwargs)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1618, in _run_visitor
    visitorcallable(self.dialect, self, **kwargs).traverse_single(element)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 138, in traverse_single
    return meth(obj, **kw)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/ddl.py", line 826, in visit_table
    include_foreign_key_constraints,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 982, in execute
    return meth(self, multiparams, params)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/ddl.py", line 72, in _execute_on_connection
    return connection._execute_ddl(self, multiparams, params)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1044, in _execute_ddl
    compiled,
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1250, in _execute_context
    e, statement, parameters, cursor, context
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1476, in _handle_dbapi_exception
    util.raise_from_cause(sqlalchemy_exception, exc_info)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 398, in raise_from_cause
    reraise(type(exception), exception, tb=exc_tb, cause=cause)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 152, in reraise
    raise value.with_traceback(tb)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1246, in _execute_context
    cursor, statement, parameters, context
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 581, in do_execute
    cursor.execute(statement, parameters)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
    result = self._query(query)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
    conn.query(q)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 517, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 732, in _read_query_result
    result.read()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 1075, in read
    first_packet = self.connection._read_packet()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 684, in _read_packet
    packet.check_error()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/protocol.py", line 220, in check_error
    err.raise_mysql_exception(self._data)
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/err.py", line 109, in raise_mysql_exception
    raise errorclass(errno, errval)
sqlalchemy.exc.InternalError: (pymysql.err.InternalError) (1050, "Table 'tmp_oly_12' already exists")
[SQL: 
CREATE TABLE tmp_oly_12 (
    user_id BIGINT, 
    total_logins BIGINT, 
    distinct_month BIGINT, 
    freq TEXT, 
    lastlogin DATETIME, 
    typelastlog TEXT
)

]
(Background on this error at: http://sqlalche.me/e/2j85)
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "user_login_type.py", line 125, in <module>
    pool.map(f, [make_q(item) for item in chunks[0:3]])
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 657, in get
    raise self._value
sqlalchemy.exc.InternalError: (pymysql.err.InternalError) (1050, "Table 'users_usage_frequency_oly_12' already exists")
[SQL: 
CREATE TABLE tmp_oly_12 (
    user_id BIGINT, 
    total_logins BIGINT, 
    distinct_month BIGINT, 
    freq TEXT, 
    lastlogin DATETIME, 

. Я вижу, что таблица tmp_oly_12 заполнена не полностью, но все же Я получаю эту ошибку ...

...