# Install the OCR.space client before importing it (run as a shell / notebook command):
# !pip install ocrspace
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import numpy as np
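# Load the labelled training data: each row is expected to hold one statement line
# ('Bank Detail') and its label ('Classes': Date, Description or Amount).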
df = pd.read_csv("All.csv")
df
get_features = df.iloc[0:2]   # preview the first two rows
get_features
df = df[pd.notnull(df['Bank Detail'])]   # drop rows with no statement text
df.info()
col = ['Bank Detail', 'Classes']
df = df[col]
df['class_id'] = df['Classes'].factorize()[0]   # numeric id for each class label
df
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn.metrics import accuracy_score
import pickle
from sklearn.model_selection import train_test_split
import cv2
import pytesseract
from pytesseract import image_to_string
stopWords = set(nltk.corpus.stopwords.words('english'))
# TF-IDF features over the statement text, ignoring English stopwords
vect = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',
                       decode_error='ignore', stop_words=stopWords)
X_train, X_test, y_train, y_test = train_test_split(df["Bank Detail"],
                                                    df["Classes"],
                                                    test_size=0.33,
                                                    random_state=42)
# Fit the vectorizer on the training text only, then reuse it to transform the test set.
xTrainvect = vect.fit_transform(X_train.values.astype('U'))
yTrainvect = y_train
xTestvect = vect.transform(X_test.values.astype('U'))
yTestvect = y_test
model = MultinomialNB(alpha=0.01, fit_prior=True)
model.fit(xTrainvect, yTrainvect)
ypred = model.predict(xTestvect)
score = accuracy_score(yTestvect, ypred)
print ("Accuracy: ",score)
def test_predict():
    # Classify one sample statement line with the trained model.
    test = "DEBIT CARD PURCHASE AT BUFFALO WILD WINGS, FARMINGTON HI, MI ON 061919 . "
    new_pred = model.predict(vect.transform([test]))
    print(new_pred)

test_predict()
img = cv2.imread("bank_sheet.jpg")

# Extracting the text data from the image via the OCR.space API
def extracting_text_from_image():
    import ocrspace
    api = ocrspace.API('6f80b6ff6288957', ocrspace.Language.English)
    text = api.ocr_file('bank_sheet.jpg')
    print(text)
    return text

banktext = extracting_text_from_image()
# get_text = pytesseract.image_to_string(img)   # alternative: OCR the image locally with Tesseract
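# A minimal sketch of that local fallback (assumes the tesseract binary is installed and
# on PATH); it only replaces banktext when the OCR.space call returned nothing:
if not banktext:
    banktext = image_to_string(img)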
dates = []
amounts = []
description = []

# Tokenizing the OCR text into lines and separately collecting dates, descriptions and amounts
def getting_dates_description_amount_separately():
    a_list = nltk.tokenize.line_tokenize(banktext)
    for s in a_list:
        # Classify each OCR line once and route it to the matching bucket.
        label = model.predict(vect.transform([s]))[0]
        if label == 'Date':
            dates.append(s)
        elif label == 'Amount':
            amounts.append(s)
        elif label == 'Description':
            description.append(s)
    print(dates, '\n')
    print(description, '\n')
    print(amounts, '\n')

getting_dates_description_amount_separately()
import pickle
with open('untitled19.pkl', 'wb') as f:
    pickle.dump(model, f)
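# The fitted TF-IDF vectorizer needs to be saved alongside the model, otherwise the
# pickled model cannot be applied to new text later. A minimal sketch (the file name
# is illustrative):
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vect, f)
# To reuse later:
#   model = pickle.load(open('untitled19.pkl', 'rb'))
#   vect = pickle.load(open('vectorizer.pkl', 'rb'))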
import csv
with open('mycsv.csv', 'w', newline='') as f:
    fieldname = ['Dates', 'Description', 'Amount']
    thewriter = csv.DictWriter(f, fieldnames=fieldname)
    thewriter.writeheader()
    # Writes the three lists into a single row (one cell per list).
    thewriter.writerow({'Dates': dates, 'Description': description, 'Amount': amounts})
data_frame = pd.read_csv("mycsv.csv")
data_frame
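# Each cell of mycsv.csv above holds an entire list. A minimal sketch of a
# one-transaction-per-row table instead (the output file name is illustrative;
# zip truncates to the shortest list, so misaligned OCR lines are simply dropped):
tidy = pd.DataFrame(list(zip(dates, description, amounts)),
                    columns=['Dates', 'Description', 'Amount'])
tidy.to_csv('transactions.csv', index=False)
tidy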