Question

Я работаю над проектом, в который я хочу импортировать огромное количество данных в arangodb. Все данные в форме .xlsx с несколькими листами. Поэтому я написал скрипт, который преобразует файлы .xlsx в json файлы (по одному json файлу для каждого листа), а затем устанавливает соединение с aran go db и выполняет массовый импорт данных в arangodb. Поэтому я написал скрипт в блокноте jupyter, установленном в windows p c с последней версией Anaconda, и он работает как шарм с локальными или удаленными подключениями к базе данных. Поэтому, увидев, что код работает, я скопировал скрипт на виртуальный сервер CentOS7, запустил его и он вышел из строя. Я запустил его на физической машине с Ubuntu 19.10, и он тоже рухнул. Обе машины linux были обновлены и работали также с последней версией Anaconda. Также скрипт запускался как из командной строки как файл .py, так и как файл .ipynb из блокнота jupyter на всех машинах (windows и linux). В windows он отлично работает, в linux он падает, когда начинает конвертировать первый файл .xlsx. Код для скрипта такой:

from zipfile import ZipFile
from bs4 import BeautifulSoup
import pandas as pd
from xlsx2csv import Xlsx2csv as x2csv
import os
import hashlib
import json
import numpy as np
from arango import ArangoClient
import glob

filelist = []
hash_dict = {}
current_folder = os.getcwd()
for file in os.listdir(current_folder):
    if file.endswith(".xlsx"):
        filelist.append(file)

#create a list of all worksheets contained in the worksheet
def create_sheet_list(file):
    with ZipFile(file) as zipped_file:
        summary = zipped_file.open(r'xl/workbook.xml').read()
    soup = BeautifulSoup(summary, "xml")
    sheets = [sheet.get("name") for sheet in soup.find_all("sheet")]
    return sheets

#create an array of dataframes from all the worksheets
def create_dataframes(file):
    xl = pd.ExcelFile(file)
    xl.sheet_names
    dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
    return dfs

def create_json(file,sheets,dfs):
    print(("The file contains {} sheets").format(len(sheets)))
    count = 0
    for i in sheets:
        json_filelist = []
        count = count + 1
#produce the dataframe and check if there are any encoding errors
        try:
            df = dfs[i]
            new_header = df.iloc[0]
            df = df[1:]
            df.columns = new_header
            df = df.fillna(0)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash']=np.nan
            df['Hash']= df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
#For the dataframes that will get an error because of encoding a different way of conversion will be used
        except UnicodeEncodeError:
            x2csv(file, outputencoding="utf-8").convert(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')),count)
            df = pd.read_csv(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')), header = 1)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash']=np.nan
            df['Hash']= df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
            os.remove(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')))

#Create connection with the Database
def create_db_connection():
    client = ArangoClient(hosts='http://127.0.0.1:8529')
    db = client.db('CEM', username='root', password='123456')
    return db

#Get the list of the .json files from all the folders
def list_of_json():
    path =  os.getcwd()
    folders = os.listdir(path)
    json_names = []
    for folder in folders:
        files = glob.glob(path+"/"+folder+"/"+"*.json")
        if len(files)>0:
            json_names.append(files)
    return json_names

#Get the list of the collections in the database
def list_of_collections(sheets,db):
    for col in sheets:
        col = col.replace(' ','_')
        if db.has_collection(col):
            collect = db.collection(col)
        else:
            collect = db.create_collection(col)
    collection = db.collections()
    collection = [i['name'] for i in collection if i['name'][0].isupper()]
    return collection

#Import the data from the .json files to the appropriate collections
def import_data(json_names,collection, db):
    for x in json_names:
        for y in x:
            for z in collection:
                with open(y, "r") as json_file:
                    if y.endswith("{}.json".format(z)):
                        data = json.load(json_file)
                        z = db.collection(z)
                        z.import_bulk(data)

for file in filelist:
    try:
#create the folder where the .json files from that UFED will be stored
        new_folder = os.mkdir(os.getcwd()+"/"+file.strip('.xlsx'))
#get the path for the new folder
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    except FileExistsError:
#if the folder already exists just get its path
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    print(new_path)
#print the name of the file that's being analyzed so that we have a measure of progress
    print(("Now I am working with {} file").format(file))
#call the functions and run the program
    create_sheet_list(file)
    create_dataframes(file)
    sheets = create_sheet_list(file)
    dfs = create_dataframes(file)
    create_json(file,sheets,dfs)
df_dict = pd.DataFrame(list(hash_dict.items()), index = None, columns = ["File_name", "Hash_num"])
df_dict.to_json(current_folder+"/hash_list.json", orient = "records")
create_db_connection()
db = create_db_connection()
#create_collections(sheets,db)
list_of_json()
json_names = list_of_json()
list_of_collections(sheets,db)
collection = list_of_collections(sheets,db)
import_data(json_names,collection,db)

Может кто-нибудь помочь?

python скрипт работает на windows, но не на linux

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

python скрипт работает на windows, но не на linux

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

Нет похожих вопросов