Я не смог найти программу, которая делала бы то, что вам нужно. Поэтому я написал свою. И теперь она работает!
#!python3
from html.parser import HTMLParser
class HTMLPass(HTMLParser):
    """Pass-through parser: echo every parsed construct to standard output.

    Each handler prints the markup for the construct it receives without a
    trailing newline (``print(end=...)``), so feeding a document reproduces
    it on stdout.  Subclasses override individual handlers to alter or
    suppress parts of the stream.
    """

    def __init__(self, *a, convert_charrefs=False, **k):
        """Create the parser.

        ``convert_charrefs`` defaults to ``False`` (unlike ``HTMLParser``) so
        that character and entity references are reported through
        ``handle_charref`` / ``handle_entityref`` and can be written back
        unchanged instead of being decoded into text.
        """
        super().__init__(*a, convert_charrefs=convert_charrefs, **k)

    def handle_starttag(self, tag, attrs):
        """Echo the start tag exactly as it appeared in the input."""
        print(end=self.get_starttag_text())

    # Self-closing tags (``<br/>``) are echoed the same way: the raw start-tag
    # text already contains the trailing slash.
    handle_startendtag = handle_starttag

    @staticmethod
    def handle_endtag(tag):
        """Echo an end tag, rebuilt from the tag name the parser reports."""
        print(end="</" + tag + ">")

    @staticmethod
    def handle_data(data):
        """Echo text content unchanged."""
        print(end=data)

    @staticmethod
    def handle_entityref(name):
        """Echo a named reference such as ``&amp;``."""
        print(end="&" + name + ";")

    @staticmethod
    def handle_charref(name):
        """Echo a numeric reference such as ``&#33;`` or ``&#x21;``."""
        print(end="&#" + name + ";")

    @staticmethod
    def handle_comment(data):
        """Echo a comment."""
        print(end="<!--" + data + "-->")

    @staticmethod
    def handle_decl(decl):
        """Echo a declaration such as ``<!DOCTYPE html>``."""
        print(end="<!" + decl + ">")

    @staticmethod
    def handle_pi(data):
        """Echo a processing instruction (``<?...>``)."""
        print(end="<?" + data + ">")

    @staticmethod
    def unknown_decl(data):
        """Echo a marked section such as ``<![CDATA[...]]>``.

        The parser reports only the text between the opening ``<![`` and the
        closing brackets, so both ends are restored here.  Sections introduced
        by ``if`` / ``else`` / ``endif`` (MS Office conditionals) close with
        ``]>``; every other section closes with ``]]>``.  Whitespace inside
        the original closing sequence is not reported and cannot be restored.
        """
        head = data.split("[", 1)[0].split()
        keyword = head[0].lower() if head else ""
        closer = "]>" if keyword in ("if", "else", "endif") else "]]>"
        print(end="<![" + data + closer)
class HTMLPassMod(HTMLPass):
    """Pass-through HTML filter that edits elements matched by simple selectors.

    Every rule argument has the form
    ``<separator><selector><separator><instruction>``; the first character of
    the argument is the separator.

    The selector is a chain of simple selectors joined by ``>`` (direct
    child).  A simple selector is an optional tag name (empty or ``*`` matches
    any tag) followed by any number of ``.class`` and ``#id`` parts.

    The first character of the instruction selects the action:

    * ``d`` - delete: drop the start and end tag, keep the content.
    * ``r<text>`` - replace: print ``<text>`` instead of the start tag and
      drop the end tag; the content is kept.
    * ``k`` - kill: drop the element together with everything inside it.
    * ``i<sep><inside><sep><after>`` - insert: keep the element, print
      ``<inside>`` right after its start tag and ``<after>`` right after its
      end tag.

    Any other instruction letter drops both tags, like ``d``.  For each
    element the first matching rule wins.

    Attributes:
        stack: open elements, innermost last, as ``(tag, attrs, action)``
            where ``action`` is the instruction applied to the element or
            ``None``.
        args: parsed rules, each ``[instruction, simple, simple, ...]`` with
            ``simple`` being ``[tag, (kind, value), ...]`` and ``kind`` either
            ``"class"`` or ``"id"``.
    """

    def __init__(self, *a, argv=None, **k):
        """Parse the rule arguments.

        Args:
            *a, **k: forwarded to the base parser.
            argv: iterable of rule strings; defaults to ``sys.argv[1:]``.

        Raises:
            ValueError: if an argument lacks the second separator or has an
                empty instruction.
        """
        super().__init__(*a, **k)
        self.stack = []
        self.args = []
        if argv is None:
            import sys
            argv = sys.argv[1:]
        for arg in argv:
            # "/a#link-1.external/d" becomes
            # ['d', ['a', ('id', 'link-1'), ('class', 'external')]]
            separator, body = arg[:1], arg[1:]
            if not separator or separator not in body:
                raise ValueError(
                    "malformed argument {!r}: expected "
                    "<separator><selector><separator><instruction>".format(arg)
                )
            # Split only once so the instruction may contain the separator.
            sel, act = body.split(separator, 1)
            if not act:
                raise ValueError("missing instruction in argument {!r}".format(arg))
            rule = [act]
            for selector in sel.split(">"):
                rule.append(self._parse_simple_selector(selector.strip()))
            self.args.append(rule)

    @staticmethod
    def _parse_simple_selector(selector):
        """Turn ``tag.class#id`` into ``[tag, ("class", ...), ("id", ...)]``."""
        import re
        # The capturing group keeps the markers: [tag, m1, v1, m2, v2, ...]
        tag, *rest = re.split(r"([.#])", selector)
        parsed = [tag]
        for marker, value in zip(rest[::2], rest[1::2]):
            parsed.append(("class" if marker == "." else "id", value))
        return parsed

    @staticmethod
    def _insert_texts(action):
        """Return the ``(inside, after)`` texts of an ``i`` instruction."""
        parts = action[2:].split(action[1]) if len(action) > 1 else [""]
        return parts[0], (parts[1] if len(parts) > 1 else "")

    @staticmethod
    def _frame_matches(frame, simple):
        """Tell whether one stack frame satisfies one simple selector."""
        wanted_tag = simple[0].replace("*", "").strip()
        if wanted_tag and frame[0] != wanted_tag:
            return False
        attrs = frame[1]
        # Only the first class attribute counts; a value-less attribute is
        # reported as None and is treated as having no classes.
        classes = next(
            ((val or "").split() for name, val in attrs if name == "class"), []
        )
        for kind, value in simple[1:]:
            if kind == "class":
                if value not in classes:
                    return False
            elif not any(name == kind and val == value for name, val in attrs):
                return False
        return True

    def _rule_matches(self, rule):
        """Tell whether the innermost open elements match the rule's chain."""
        chain = rule[1:]
        # Without this check zip() would silently ignore the missing
        # ancestors and "div > h2" would match a top-level <h2>.
        if len(chain) > len(self.stack):
            return False
        return all(
            self._frame_matches(frame, simple)
            for frame, simple in zip(reversed(self.stack), reversed(chain))
        )

    def _suppressed(self):
        """Tell whether the parser is currently inside a killed element."""
        if not self.stack:
            return False
        action = self.stack[-1][2]
        return action is not None and action[:1] == "k"

    def handle_starttag(self, tag, attrs):
        """Push the element and emit whatever its matching rule dictates."""
        if self._suppressed():
            # kill means kill: descendants inherit the action so that their
            # own content and end tags are dropped as well.
            self.stack.append((tag, attrs, self.stack[-1][2]))
            return
        self.stack.append((tag, attrs, None))
        for rule in self.args:
            if not self._rule_matches(rule):
                continue
            action = rule[0]
            self.stack[-1] = (tag, attrs, action)
            if action[0] == "r":
                print(end=action[1:])
            elif action[0] == "i":  # insert (inside / after)
                super().handle_starttag(tag, attrs)
                print(end=self._insert_texts(action)[0])
            # delete, kill and unknown instructions print nothing here
            return
        super().handle_starttag(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag: open the element and close it at once."""
        self.handle_starttag(tag, attrs)
        self.stack.pop()

    def handle_endtag(self, tag):
        """Pop the innermost element and emit its end tag unless a rule drops it."""
        if not self.stack:
            # Stray end tag with nothing open: pass it through unchanged.
            return super().handle_endtag(tag)
        # TODO: Implement proper HTML-isn't-XML behaviour.  The popped frame
        # is assumed to belong to `tag`; void elements written without a
        # slash (<br>, <img>) and mismatched end tags leave stale frames.
        frame = self.stack.pop()
        action = frame[2]
        if action is None:
            return super().handle_endtag(tag)
        if action[0] == "i":
            super().handle_endtag(tag)
            print(end=self._insert_texts(action)[1])
        # every other instruction drops the end tag

    def handle_data(self, data):
        """Echo text unless inside a killed element."""
        if not self._suppressed():
            super().handle_data(data)

    def handle_entityref(self, name):
        """Echo a named reference unless inside a killed element."""
        if not self._suppressed():
            super().handle_entityref(name)

    def handle_charref(self, name):
        """Echo a numeric reference unless inside a killed element."""
        if not self._suppressed():
            super().handle_charref(name)

    def handle_comment(self, data):
        """Echo a comment unless inside a killed element."""
        if not self._suppressed():
            super().handle_comment(data)

    def handle_decl(self, decl):
        """Echo a declaration unless inside a killed element."""
        if not self._suppressed():
            super().handle_decl(decl)

    def handle_pi(self, data):
        """Echo a processing instruction unless inside a killed element."""
        if not self._suppressed():
            super().handle_pi(data)

    def unknown_decl(self, data):
        """Echo a marked section unless inside a killed element."""
        if not self._suppressed():
            super().unknown_decl(data)
def run(pass_through=HTMLPassMod):
    """Filter standard input through a parser.

    Args:
        pass_through: callable taking no arguments and returning an object
            with ``feed(text)`` and ``close()``; it is instantiated once.

    Lines are fed exactly as read, so a final line without a trailing
    newline does not gain one.
    """
    import sys
    parser = pass_through()
    for line in sys.stdin:
        parser.feed(line)
    # Flush whatever the parser is still buffering.
    parser.close()


# Script entry point: filter stdin to stdout using the command-line rules.
if __name__ == "__main__":
    run()
Этот код ужасен, но на самом деле он работает правильно, в том числе во многих крайних случаях.
Пример использования:
wizzwizz4@wizzwizz4Laptop:~$ cat example_input.html
<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
wizzwizz4@wizzwizz4Laptop:~$ <example_input.html ./rubbish_program.py '~div.newdiv~r<h2>Title</h2>'
<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
wizzwizz4@wizzwizz4Laptop:~$ cat example_input_2.html
<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
wizzwizz4@wizzwizz4Laptop:~$ <example_input_2.html ./rubbish_program.py 'Jdiv.containerJi~<div class="newdiv">~</div>' '\.container > h2\k'
<div class="bg-detail2" id="geometry">
<div class="container"><div class="newdiv">
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div></div>
</div>
Синтаксис
./rubbish_program.py [argument...]
, где argument
имеет вид:
<separator><selector><separator><instruction>
, где: