Я выбрал экземпляр t2.2xlarge с 8 процессорами и 32 ГБ в памяти. Тем не менее, я чувствую, что производительность такая же, как и у «бесплатной версии», на которой я запускал свой скрипт на Python. Когда я смотрю на использование процессора на моей машине, он говорит только 8%.
Как я могу использовать гораздо больше своих процессоров?
Вот следующий код, который я сейчас выполняю на этих экземплярах EC2:
def connectToDB():
databaseServerIP = "mydb.us-east-2.rds.amazonaws.com" # IP address of the MySQL database server
databaseUserName = "mydbUsername" # User name of the database server
databaseUserPassword = "mypwd" # Password for the database user
cursorType = pymysql.cursors.DictCursor
connectionInstance = pymysql.connect(host=databaseServerIP,
user=databaseUserName,
password=databaseUserPassword,
cursorclass=cursorType,
autocommit=True)
# Create a cursor object
cursorInstance = connectionInstance.cursor()
return connectionInstance, cursorInstance
def construct_each_company(tmpDF_forPeerGroup, ii):
print(tmpDF_forPeerGroup['Name'].values[ii])
finalBigDataframe = pd.DataFrame(date_generated, index = date_generated)
#symbolToCheck = tmpDF_forPeerGroup['Symbol'].values[ii]
idx = tmpDF_forPeerGroup.index[ii]
#####################
#####################
##### dataframe 1
try:
connectionInstance, cursorInstance = connectToDB()
sql = "SELECT * FROM DB1.Scores WHERE company_idx = "+str(idx)
finalBigDataframe_1 = pd.read_sql(sql, con=connectionInstance)
except:
finalBigDataframe_1 = None
#####################
#####################
##### dataframe 2
try:
connectionInstance, cursorInstance = connectToDB()
sql = "SELECT * FROM DB2.Scores WHERE company_idx = "+str(idx)
finalBigDataframe_2 = pd.read_sql(sql, con=connectionInstance)
except:
finalBigDataframe_2 = None
#####################
#####################
##### dataframe 3
try:
connectionInstance, cursorInstance = connectToDB()
sql = "SELECT * FROM DB3.Scores WHERE company_idx = "+str(idx)
finalBigDataframe_3 = pd.read_sql(sql, con=connectionInstance)
except:
finalBigDataframe_3 = None
#####################
#####################
##### dataframe 4
try:
connectionInstance, cursorInstance = connectToDB()
sql = "SELECT * FROM DB4.Scores WHERE company_idx = "+str(idx)
finalBigDataframe_4 = pd.read_sql(sql, con=connectionInstance)
except:
finalBigDataframe_4 = None
##################
##################
##################
##################
# merge for every input
# this is not right though...
tmpList_forThisCompany = [finalBigDataframe_1, finalBigDataframe_2, finalBigDataframe_3, finalBigDataframe_4]
return (ii, tmpList_forThisCompany)
def collect_result(result):
global results
results.append(result)
import multiprocessing as mp
for elem_PeerGroup in list(sorted(finalDict))[:]:
print(elem_PeerGroup)
#elem_PeerGroup = 'Africa - Banks'
########################################
### FOR ALL COMPANIES IN THIS PEER GROUP
tmpDF_forPeerGroup = finalDict[elem_PeerGroup]
if len(tmpDF_forPeerGroup)!=0:
########################
## CREATE A FINAL LIST FOR COMPANIES
#finalListForCompanies = []
########################
## CREATE DATETIME RANGE
start = datetime.strptime("01-01-2004", "%d-%m-%Y")
end = datetime.strptime("06-04-2019", "%d-%m-%Y")
date_generated = [start + timedelta(days=x) for x in range(0, (end-start).days)]
# each process will use each CPU
#pool = mp.Pool(mp.cpu_count())
pool = mp.Pool(2)
results=[]
for ii in range(0, len(tmpDF_forPeerGroup)):
pool.apply_async(construct_each_company, args=(tmpDF_forPeerGroup, ii), callback=collect_result)
pool.close()
# postpones the execution of next line of code until all processes in the queue are done.
pool.join()
# Step 5: Sort results [OPTIONAL]
results.sort(key=lambda x: x[0])
finalListForCompanies = [r for (ii, r) in results]
else:
continue
finalScores = []
# for each dataframe, NORMALIZE the companies in the PEER GROUP
for kk in range(4):
#print(kk)
tmpListForNormalisation=[]
for elem in finalListForCompanies:
tmpListForNormalisation.append(elem[kk])
dict_of_dfs = dict(enumerate(tmpListForNormalisation))
try:
dframes = pd.concat(dict_of_dfs)
except:
finalScores.append(None)
continue
dframes = dframes.iloc[:,1:]
if len(dframes)==0:
finalScores.append(None)
continue
if len(dframes)==len(dframes.groupby(level=1)):
arrayTest=[]
for k in range(len(tmpListForNormalisation)):
if (tmpListForNormalisation[k] is None) or (len(tmpListForNormalisation[k])==0):
arrayTest.append(None)
else:
arrayTest.append(tmpListForNormalisation[k])
# put the final result into a list
dict_of_dfs2 = dict(enumerate(arrayTest))
finalScores.append(dict_of_dfs2)
else:
test = dframes.groupby(level=1).pipe(lambda g: dframes.sub(g.mean(), level=1).div(g.std(), level=1))
tmpListForNormalisation2=[]
for date, new_df in test.groupby(level=0):
tmpListForNormalisation2.append(new_df)
arrayTest=[]
j=0
for k in range(len(tmpListForNormalisation)):
if (tmpListForNormalisation[k] is None) or (len(tmpListForNormalisation[k])==0):
arrayTest.append(None)
else:
arrayTest.append(tmpListForNormalisation2[j])
j+=1
test_min = test.min(level=1)
test_max = test.max(level=1)
dict_of_dfs2 = dict(enumerate(arrayTest))
def nrm(d):
_d = d
_d.index = _d.index.get_level_values(1)
NewRange = np.array([0, 100])
o = test_max - test_min
n = NewRange[1] - NewRange[0]
return (((_d - test_min) * n) / o) + NewRange[0]
for k, d in dict_of_dfs2.items():
if d is None:
continue
d.loc[:] = nrm(d).rolling(window=7).mean()
# put the final result into a list
finalScores.append(dict_of_dfs2)
# take the final MEAN for every company
for ll in range(len(tmpDF_forPeerGroup)):
namex = tmpDF_forPeerGroup['Name'].values[ll]
print("Inserting to DB...", namex)
company_idx = tmpDF_forPeerGroup['Company_idx'].values[ll]
company_symbol = tmpDF_forPeerGroup['Symbol'].values[ll]
industryName = tmpDF_forPeerGroup['GICS_Industry_Name'].values[ll]
try:
val1 = finalScores[0][ll]
except:
val1 = None
try:
val2 = finalScores[1][ll]
except:
val2 = None
try:
val3 = finalScores[2][ll]
except:
val3 = None
try:
val4 = finalScores[3][ll]
except:
val4 = None
tmpList = [val1, val2, val3, val4]
tmpDf = dict(enumerate(tmpList))
dframes = pd.concat(tmpDf)
finfin = dframes.mean(level=1)
# adjust according to its industry weights
finfin = adjustWeights(industryName, finfin)
# take data from 01.01.2007 onwards only
finfin = finfin['2007/01/01':]
#####################
# NOW PUT TO DATABASE
engine = create_engine("mysql://mydb.us-east-2.rds.amazonaws.com/"+newDatabaseName)
con = engine.connect()
finfin['timestamp'] = finfin.index
finfin['company_idx'] = [company_idx]*len(finfin)
finfin['company_symbol'] = [company_symbol]*len(finfin)
finfin.to_sql(name='Scores', con=con, if_exists='append', index=False)
Я не понимаю, почему в этом случае моя виртуальная машина использует только 8% моего процессора. Я не вижу ошибок в своем коде, так как он должен зацикливаться на многих различных компаниях и выделять один ЦП на компанию.