Я написал код, который отслеживает веб-сайты и генерирует результаты в формате html с выделенным красным и зеленым цветом текстом.
Теперь генерируемый html является необработанным html без css и javascript и выглядит неструктурированным и трудным для чтения.
Как я могу отобразить результаты так, чтобы они выглядели как оригинальная страница — со стилями CSS и JavaScript?
Ниже мой код:
import re;
from tkinter import *;
import tkinter.filedialog as fdial;
import os;
import tkinter.scrolledtext as ScrolledText;
import urllib,requests
import urllib.parse as Parser
from urllib.parse import urlparse
import time
from lxml.html.diff import htmldiff
import difflib
# Minimum archive age in seconds before a URL is re-downloaded.
# 0 * 60 == 0, so the "archive already updated less than an hour ago"
# skip in generateDataset() is currently disabled (use 60 * 60 for one hour).
threshold = 0* 60;
def getLastUpdateFile(path):
    """Return info about the most recently modified archive file in *path*.

    Scans the directory non-recursively.  Files whose name starts with
    "result" are diff outputs, not archives: the last one listed is
    reported under the 'result' key instead of competing on mtime.

    Args:
        path: directory to scan (trailing separator optional).

    Returns:
        dict with 'Name' (full path of the newest archive, or None when
        the directory holds no archives) and 'time' (its mtime, 0 when
        none); plus 'result' (full path) when a result_* file exists.
    """
    entries = os.listdir(path)
    print(">" + path, len(entries))
    latest_name = None
    latest_time = 0
    result_name = None  # tracked separately: BUGFIX — the original dropped
    # the 'result' key when a newer archive was found after the result file.
    for entry in entries:
        # os.path.join works whether or not *path* has a trailing separator
        # (the original's `path + entry` required one).
        full_name = os.path.join(path, entry)
        if entry.startswith("result"):
            result_name = full_name
            continue
        mtime = os.path.getmtime(full_name)
        if mtime > latest_time:
            latest_name = full_name
            latest_time = mtime
    latest = {"Name": latest_name, "time": latest_time}
    if result_name is not None:
        latest["result"] = result_name
    return latest
class webMonitor:
    """Tkinter GUI that tracks a list of web URLs for changes.

    Layout: a top bar with the selected-file label and picker button,
    a listbox of loaded URLs on the left and, on the right, a timer
    entry, a scan button and a read-only log pane.
    """

    def __init__(self):
        # Root window and mutable application state.
        self.root = Tk();
        self.urlFil="";  # path of the currently selected URL list file
        self.timer=1;  # scan timer value; overwritten by setTimer()
        self.stopScan=False;  # flag set to abort a scan in progress
        self.filName=StringVar();  # text shown in the file label
        self.filName.set("NoFile selected");
        self.root.title("Web URL Tracker");
        self.root.geometry("500x400");
        # Height is pinned to 400 px; width may resize between 800 and 1000.
        self.root.minsize(800,400);
        self.root.maxsize(1000,400);
        # --- top bar: file-name label + "select url file" button ---
        BarFrame = Frame(self.root,borderwidth=2, relief="groove");
        BarFrame.pack(side=TOP);
        label = Label(BarFrame,textvariable=self.filName,bg="white",relief = RAISED,height=1);
        label.grid(row=0,column=0);
        self.fileButton = Button(BarFrame,text="select url file", width=10, command = self.getUrlFile, bg="black", fg="white", activebackground = "grey", relief = RAISED, height=1);
        self.fileButton.grid(row=1,column=0);
        # --- main area: URL listbox (left) and controls/log (right) ---
        MainWindow=Frame(self.root,borderwidth=2, relief="groove");
        MainWindow.pack(side=RIGHT);
        urlWindow=Frame(MainWindow,borderwidth=2, relief="groove");
        urlWindow.pack(side=LEFT,fill=BOTH);
        scrollbar=Scrollbar(urlWindow);
        scrollbar.pack(side=LEFT,fill=Y);
        # Listbox is filled by loadFile() with the normalized URLs.
        self.urlText = Listbox(urlWindow,width=35, yscrollcommand = scrollbar.set)
        self.urlText.pack(side=LEFT,fill=BOTH);
        scrollbar.config(command = self.urlText.yview );
        # --- right pane: timer controls, scan button and log text ---
        LogWindow= Frame(MainWindow,borderwidth=2, relief="groove");
        LogWindow.pack(side=LEFT,fill=X);
        timerWindow = Frame(LogWindow,borderwidth=2, relief="groove");
        timerWindow.pack(fill=X);
        timeText=Button(timerWindow,text="Timer",activebackground="white",bg="steelBlue3",width=10,command=self.setTimer);
        timeText.pack(fill=X);
        self.timeEntry=Entry(timerWindow,justify=CENTER);
        self.timeEntry.pack(fill=X);
        # Seed the entry with a default timer value of 60.
        self.timeEntry.delete(0, END)
        self.timeEntry.insert(0, "60")
        self.LoggerButton=Button(LogWindow,text="Web Scan", bg="steelBlue3",command=self.Scanner, activebackground = "white", relief = RAISED, height=1);
        self.LoggerButton.pack(side=TOP);
        # The log pane stays DISABLED; logInsert()/clearLog() toggle it to write.
        self.Logs = Text(LogWindow,state=DISABLED);
        scrollbar1=Scrollbar(LogWindow,command=self.Logs.yview);
        scrollbar1.pack(side=RIGHT,fill=Y);
        self.Logs.pack(expand=True,side=BOTTOM,fill=BOTH);
        self.Logs['yscrollcommand'] = scrollbar1.set;
def setTimer(self):
timer=None;
try:
self.timer=float(self.timeEntry.get());
except Exception as e:
self.timeEntry.delete(0, END);
self.timeEntry.insert(0, self.timer);
def getUrlFile(self):
urlFil=fdial.askopenfile(parent=self.root,mode='rb',title='Choose a file');
if not urlFil:
return;
self.urlFil=urlFil.name;
self.filName.set(self.urlFil);
print(self.urlFil);
self.loadFile();
self.stopScan=True;
def logInsert(self,line):
self.Logs.config(state=NORMAL);
self.Logs.insert(END,line+"\n");
self.Logs.config(state=DISABLED);
def clearLog(self):
self.Logs.config(state=NORMAL);
self.Logs.delete(1.0,END);
self.Logs.config(state=DISABLED);
def separator(self):
self.logInsert("-"*25+"\n");
def preRequisites(self):
self.clearLog();
self.dataSet=self.mainDirectory+"\\dataSet\\";
self.LogSet=self.mainDirectory+"\\logs\\";
self.CsvLogSet = self.mainDirectory+'\\csvLogSet\\'
if not os.path.isdir(self.dataSet):
self.logInsert("\tcreating archive storage");
os.mkdir(self.dataSet);
self.separator();
if not os.path.isdir(self.CsvLogSet):
self.logInsert("\tcreating CSV archive storage");
os.mkdir(self.CsvLogSet);
self.separator();
if not os.path.isdir(self.LogSet):
self.logInsert("\tcreating a log folder");
os.mkdir(self.LogSet);
self.separator();
self.separator();
def Scanner(self):
self.stopScan=False;
#while True:
if not self.urlFil:
return;
self.mainDirectory=os.path.dirname(self.urlFil);
print(self.mainDirectory,os.path.isdir(self.mainDirectory));
if not os.path.isdir(self.mainDirectory):
self.getUrlFile();
return;
self.LoggerButton.config(state=DISABLED);
#try:
self.preRequisites();
self.generateDataset();
self.LoggerButton.config(state=NORMAL);
#except Exception as Exep:
# print(Exep);
#finally:
#for i in range(int(self.timer)):
# for j in range(6):
# print ("j",j);
# if self.stopScan:
# self.stopScan=False;
# self.LoggerButton.config(state=NORMAL);
# return;
# time.sleep(10);
print("in scanner",self.urlFil);
    def generateDataset(self):
        """Fetch every URL in self.AllUrls, archive the HTML and diff it.

        For each URL: download the page, rewrite relative href=/src=
        link prefixes to absolute ones (so the archived copy keeps its
        CSS/JS references), store the page under
        dataSet/<sanitized-url>/<timestamp>.html, skip the URL when an
        archive newer than `threshold` seconds already exists, and diff
        against the previous archive into a result_*.html with <ins>/<del>
        highlighted green/red.  Progress goes to the GUI log, a per-run
        .txt log and a .csv log.
        """
        self.tstr = time.time();
        # Timestamp doubles as the archive/log file name (GMT, filesystem-safe).
        self.tStamp=time.strftime('%d-%m-%Y %H-%M-%S', time.gmtime(self.tstr));
        self.logInsert("Generating Log: "+self.LogSet+self.tStamp+".txt");
        logFile = open(self.LogSet+self.tStamp+".txt",'w+', encoding="utf-8");
        logCsvFile = open(self.CsvLogSet+self.tStamp+".csv",'w+',encoding="utf-8");
        logCsvFile.write("Url,status,fileName\n");
        # Matches the `href = "` / `src = "` prefix of a link attribute;
        # the link target itself begins right after each match.
        cssJs = re.compile(r"href[\s]*=[\s]*\"[\s]*|src[\s]*=[\s]*\"[\s]*");
        for i in self.AllUrls:
            self.separator();
            domainName = urlparse(i).netloc;
            print(domainName);
            print(i);
            self.logInsert(i);
            logFile.write(i+"\n")
            if len(i)==0:
                continue;
            try:
                # Collected (match_start, rewritten_prefix, match_end, peek) tuples.
                ScriptDetected = [];
                print(">>>>",i);
                webSite = requests.get(i);
                # NOTE(review): latin-1 decoding never raises but can mangle
                # non-Latin pages — confirm the target sites' encodings.
                webHtml=webSite.content.decode("latin-1").replace('\r','');
                for m in cssJs.finditer(webHtml):
                    #print(m.group());
                    newLink=m.group();
                    #print("2 ",newLink);
                    # Peek at the first 15 chars after the attribute prefix
                    # to classify the link target.
                    linkStart = webHtml[m.span()[1]:m.span()[1]+15];
                    #print(">",linkStart);
                    if linkStart.startswith("//"):
                        # Protocol-relative URL: just add a scheme.
                        newLink= newLink+'http:';
                    elif linkStart.startswith("/"):
                        # Site-absolute path: prepend scheme and host.
                        newLink= newLink+'http://'+domainName+"/";
                        print(linkStart,m.group(),newLink)
                    elif not (linkStart.startswith("http:") or linkStart.startswith("https:")):
                        # Relative path: prepend the page URL itself.
                        newLink= newLink+i+"/";
                    ScriptDetected.append((m.start(),newLink,m.start()+len(m.group()),linkStart));
                    #print("::",(m.start(),newLink,m.start()+len(m.group())))
                    #print(2,newLink);
                    #print(newLink,"\n",m.group(),"\n\n");
                # Rebuild the document, splicing each rewritten prefix in
                # place of the original attribute prefix.
                parseHtml="";
                startHtml = 0
                for m in ScriptDetected:
                    parseHtml =parseHtml+ webHtml[startHtml:m[0]];
                    parseHtml =parseHtml + m[1];
                    startHtml=m[2];
                parseHtml=parseHtml+ webHtml[startHtml:];
                webHtml = parseHtml;
                print();
            except Exception as e:
                # Network or decode failure: record it and move on to the next URL.
                print(e);
                print("can't open url: "+i+"\n");
                self.logInsert("can't open url: "+i+"\n");
                logFile.write("can't open url: "+i+"\n");
                logFile.write("-"*25+"\n");
                continue;
            # Derive a filesystem-safe folder name from the URL.
            databaseName = i.replace("http://","").replace("https://","").replace("www.","");
            databaseName = Parser.quote_plus(databaseName).replace(".","_").replace("%2F","-");
            dbName= self.dataSet+databaseName+"\\";
            if not os.path.isdir(dbName):
                os.mkdir(dbName);
            newArchieve=dbName+self.tStamp+".html";
            #print(databaseName);
            latestFile=getLastUpdateFile(dbName);
            #print(latestFile);
            # Skip the URL when the newest archive is younger than `threshold`
            # seconds (threshold is currently 0, so this never triggers).
            if latestFile['Name'] and latestFile['time'] + threshold > self.tstr:
                print("\tarchieve already updated less than an hour ago");
                logCsvFile.write(i+",archieve already updated less than an hour ago,"+latestFile['Name']);
                self.logInsert("\tarchieve already updated less than an hour ago\n");
                logFile.write("\tarchieve already updated less than an hour ago\n");
                continue;
            currentHtml = open(newArchieve,"w+");
            currentHtml.write(webHtml);
            currentHtml.close();
            if not latestFile['Name']:
                # First visit: archive saved above, nothing to diff against.
                print("\tNo archives found");
                logCsvFile.write(i+",creating first Copy,"+newArchieve+"\n");
                self.logInsert("\tNo archives found");
                logFile.write("\tNo archives found\n");
                continue;
            prevHtml = open(latestFile['Name'],'r');
            prevHtmlScan = prevHtml.read();
            prevHtml.close();
            if prevHtmlScan == webHtml:
                print("\tno new changes found");
                self.logInsert("\tno new changes found");
                logFile.write("\tno new changes found\n");
                logCsvFile.write(i+",no new changes found,"+latestFile['Name']+"\n");
                continue;
            # Result name encodes both timestamps: result_<old>_To_<new>.html
            resultFileName= dbName+"result_"+ latestFile['Name'].split("\\")[-1].split('.')[0] +"_To_"+ self.tStamp+".html";
            print("\tresult_FileName:"+resultFileName);
            self.logInsert("\tresult_FileName:"+resultFileName);
            logFile.write("\tresult_FileName"+resultFileName+"\n");
            newResult = open(resultFileName,"w+", encoding="utf-8");
            # NOTE(review): lxml's htmldiff(old, new) — here the NEW page is
            # passed first, so the <ins>/<del> (green/red) highlighting may
            # be inverted; confirm against lxml.html.diff documentation.
            newResult.write(htmldiff(webHtml,prevHtmlScan).replace("<ins>","<ins style=\"background-color: green\">").replace("<del>","<del style=\"background-color: red\">"));
            newResult.close();
            if 'result' in latestFile:
                # Only the latest diff is kept per site; drop the stale one.
                #logFile.write("\tremoving old result file"+latestFile['result']+"\n");
                print("\tremoving old result file"+latestFile['result']);
                self.logInsert("\tremoving old result file"+latestFile['result']);
                os.remove(latestFile['result']);
            logFile.write("-"*25+"\n");
        logFile.close();
        logCsvFile.close();
def loadFile(self):
self.urlText.delete(0,'end')
self.AllUrls=set();
Fp = open(self.urlFil,"r");
for line in Fp:
line=line.replace("\n","").replace(" ","");
if line=="":
continue;
line= line.lower();
if not line.startswith("http"):
if line.startswith("://"):
line="http";
else:
line ="http://"+line;
self.AllUrls.add(line);
self.urlText.insert(END,line);
Fp.close();
def run(self):
self.root.mainloop();
# Script entry point: build the GUI and hand control to the Tk event loop.
if __name__ =="__main__":
    gui = webMonitor();
    gui.run();