Мой скрипт на Python выполняется часами, так как код неэффективен. Могу ли я сделать этот код быстрее?
# Number of vector components kept per word (the columns that follow the
# first column of the vectors file; the original comments describe 100).
VECTOR_DIM = 100
# Each sequence is cut into consecutive, non-overlapping words of this length.
WORD_LEN = 3


def _load_word_vectors(vectors_path: str) -> tuple[dict, int]:
    """Build a ``{word: vector}`` lookup from the tab-separated vectors file.

    Words are read from the column labelled ``'0'``; the vector components
    are the (at most ``VECTOR_DIM``) columns that follow the first column.

    Returns:
        A ``(lookup, dim)`` pair where ``lookup`` maps each word to a 1-D
        float64 array and ``dim`` is the number of components per vector.
    """
    vectors = pd.read_csv(vectors_path, sep='\t')
    matrix = vectors.iloc[:, 1:1 + VECTOR_DIM].to_numpy(dtype=np.float64)
    lookup = {}
    for word, row in zip(vectors['0'].tolist(), matrix):
        # Words are expected to be unique (per the original comments);
        # if one repeats, its first row is the one that is used.
        lookup.setdefault(word, row)
    return lookup, matrix.shape[1]


def _sum_word_vectors(sequence: str, lookup: dict, dim: int) -> np.ndarray:
    """Return the sum of the vectors of every ``WORD_LEN``-letter word in *sequence*.

    A trailing chunk shorter than ``WORD_LEN`` is looked up like any other
    word. Words missing from *lookup* contribute nothing, which is the same
    as adding a zero vector.
    """
    total = np.zeros(dim)
    for start in range(0, len(sequence), WORD_LEN):
        row = lookup.get(sequence[start:start + WORD_LEN])
        if row is not None:
            # Accumulate in sequence order so the floating-point result
            # matches a plain left-to-right sum.
            total += row
    return total


def main(
    texts_path: str = "texts.csv",
    vectors_path: str = "word_n_vectors.csv",
    output_path: str = "output.csv",
) -> None:
    """Write one summed word vector per input line to *output_path*.

    Args:
        texts_path: Tab-separated file with a ``"text"`` column of strings
            (e.g. ``ABVDGSVHSJ``).
        vectors_path: Tab-separated file whose ``'0'`` column holds the
            words and whose following columns hold each word's vector.
        output_path: Tab-separated output file; row *i* is the sum of the
            vectors of all words found in line *i* of the texts file.

    The previous version compared every word against the whole words column
    of the vectors table; building a dictionary once turns each lookup into
    a constant-time operation.
    """
    texts = pd.read_csv(texts_path, sep='\t')
    lookup, dim = _load_word_vectors(vectors_path)

    # newline='' lets the csv module control line endings itself (otherwise
    # Windows output gets an extra carriage return on every row).
    with open(output_path, 'w', newline='') as vectors_seq:
        writer = csv.writer(vectors_seq, delimiter="\t")
        writer.writerows(
            _sum_word_vectors(sequence, lookup, dim)
            for sequence in texts["text"]
        )
    print("Writing complete")


if __name__ == "__main__":
    main()