Мне нравятся парсеры и теория компиляторов, поэтому я написал (вручную) небольшой парсер, который разбирает ваш пример фрагмента и строит из него
объект XML DOM Document. Его можно изменить так, чтобы он создавал древовидную структуру другого типа, например пользовательское AST (абстрактное синтаксическое дерево).
Я пытался сделать код легким для чтения, чтобы вы могли видеть, как работает такой парсер.
Спросите меня, если вам нужны дополнительные объяснения или вы хотите, чтобы я немного его изменил.
С вашим примером фрагмента в качестве входных данных код result = new OrgModeParser().parse(input); result.xml
вернул:
<org-mode-document indentLevel="-1">
<section indentLevel="0">
<header indentLevel="0">This is a heading</header>
<paragraph indentLevel="1">P1 Start a paragraph here but since it is the first indentation level the paragraph may have a lower indentation on the next line or a greater one for that matter.</paragraph>
<list indentLevel="1">
<list-item indentLevel="1">
<paragraph indentLevel="2">LI1.1 I am beginning a list here</paragraph>
</list-item>
<list-item indentLevel="1">
<paragraph indentLevel="2">LI1.2 Here begins another list item which continues here and also here</paragraph>
</list-item>
</list>
<paragraph indentLevel="1">P2 but is broken here (this line becomes a paragraph outside of the first list).</paragraph>
<list indentLevel="1">
<list-item indentLevel="1">
<paragraph indentLevel="2">LI2.1 P1 Second list item.</paragraph>
<list indentLevel="2">
<list-item indentLevel="2">
<paragraph indentLevel="3">LI2.1.1 Inner list with a simple item</paragraph>
</list-item>
<list-item indentLevel="2">
<paragraph indentLevel="3">LI2.1.2 P1 and with an item containing several paragraphs. Here is the second line in the item, and now</paragraph>
<paragraph indentLevel="3">LI2.1.2 P2 I begin a new paragraph still in the same item. The indentation can be only higher</paragraph>
</list-item>
</list>
<paragraph indentLevel="2">LI2.1 P2 but if the indentation is lower, it breaks the item, (and the whole list), and this is a paragraph in the LI2.1 list item</paragraph>
<list indentLevel="2">
<list-item indentLevel="2">
<paragraph indentLevel="3">LI2.2.1 You get the picture</paragraph>
</list-item>
</list>
</list-item>
</list>
<paragraph indentLevel="1">P3 Just plain text outside of the list.</paragraph>
</section>
</org-mode-document>
код:
/*
* File: orgmodparser.js
* Basic usage: var object = new OrgModeParser().parse(input);
* Works on: JScript and JScript.Net.
* - For other JavaScript platforms, just replace or override the .createRoot() method
*/
/**
 * Constructor of the org-mode parser.
 *
 * Declared with `var` so that the name is a real declaration: assigning to an
 * undeclared identifier creates an implicit global and throws a ReferenceError
 * in strict-mode code.
 *
 * @param {Object} [options] Optional overrides. Every enumerable property of
 *        this object is copied onto the new instance, where it shadows the
 *        property of the same name on the prototype. Non-object values are
 *        ignored; `null` is accepted and copies nothing.
 */
var OrgModeParser = function (options) {
    if (typeof options == "object") {
        // A for-in loop over null runs zero times, so no extra null check is needed
        for (var key in options) {
            this[key] = options[key];
        }
    }
};
OrgModeParser.prototype = {
    // Number of spaces that make up one indentation level
    "INDENT_WIDTH" : 2,
    // Separator used to split the input into lines. Bare "\r" and "\n" are
    // always accepted as line breaks in addition to this value.
    "LINE_SEPARATOR" : "\r\n",
    /*
     * Each line in the input will be matched against this regexp.
     * Group 1 is the indentation, group 2 the optional modifier and group 3 the content.
     * Only spaces are allowed as indentation characters.
     * The symbols '*', '+' and '-' will be recognized, but only if they are followed by at least one space.
     * Add other symbols in this regexp if you want the parser to recognize them
     */
    "re" : /^( *)([\+\-\*] +)?(.*)/,

    /*
     * Creates the empty document that parse() fills.
     * This function must return a valid XML DOM document object, or null on failure.
     * This default implementation tries a list of MSXML ProgIDs through ActiveXObject;
     * replace or override it on platforms without ActiveXObject.
     */
    createRoot : function () {
        var progIDs = ["Msxml2.DOMDocument.6.0", "Msxml2.DOMDocument.5.0", "Msxml2.DOMDocument.4.0", "Msxml2.DOMDocument.3.0", "Msxml2.DOMDocument.2.0", "Msxml2.DOMDocument.1.0", "Msxml2.DOMDocument"];
        for (var i = 0; i < progIDs.length; i++) {
            try {
                return new ActiveXObject(progIDs[i]);
            }
            catch (err) {
                // Deliberately ignored: this ProgID could not be instantiated, try the next one
            }
        }
        alert("Org-mode parser - Error - Failed to instantiate root object");
        return null;
    },

    /*
     * Parses org-mode-like text into an XML DOM document.
     *
     * The result has one <org-mode-document> element holding <section>, <header>,
     * <list>, <list-item> and <paragraph> elements. Each element carries an
     * "indentLevel" attribute.
     *
     * text:    the input, must be a string.
     * returns: the document created by createRoot(), or null when the input is not
     *          a string, when no document could be created, or when a parse error
     *          was reported (errors are reported through alert()).
     */
    parse : function (text) {
        if (typeof text != "string") { alert("Org-mode - Type error - Input must be of type 'string'"); return null; }

        var body;                   // The <org-mode-document> element
        var content;                // The text of the current line, without its indentation and modifier
        var lastNode;               // The node being processed
        var indent;                 // The indentation of the current line
        var isAfterDubbleLineBreak; // Indicates if the current line follows a blank line
        var level;                  // The current indentation level; given by indent.length / this.INDENT_WIDTH. Not to confuse with the nesting level
        var lines;                  // Array. Empty lines are included.
        var match;
        var modifier;               // This can be "*", "+", "-" or ""
        var root;                   // The document returned by createRoot()
        var i;                      // Index of the current line; shared with the helpers below
        var ok;                     // Result of the handler of the current line

        // Creates an element stamped with the current indentation level and an optional text child
        function createNode(tagName, nodeText) {
            var node = root.createElement(tagName);
            node.setAttribute("indentLevel", level);
            if (nodeText) {
                node.appendChild(root.createTextNode(nodeText));
            }
            return node;
        }

        // Finds the element that must receive the node created for the current line.
        // Returns null (after reporting) when no suitable element exists.
        function getContainer() {
            // The document element and sections accept children directly. Handling the
            // document element here lets text or lists appear before the first heading;
            // otherwise the walk below would start at the Document node, which has no getAttribute().
            if (lastNode.tagName == "section" || lastNode.tagName == "org-mode-document") { return lastNode; }
            var anc = lastNode.parentNode;
            var ancLevel;
            // nodeType 1 is an element; stop before reaching the Document node
            while (anc && anc.nodeType === 1) {
                ancLevel = Number(anc.getAttribute("indentLevel"));
                // A list item joins an enclosing list that sits at the same level
                if ((modifier === "+" || modifier === "-") && ancLevel === level && anc.tagName == "list") { return anc; }
                // Otherwise take the nearest less indented ancestor that is not a paragraph
                if (ancLevel < level && anc.tagName != "paragraph") { return anc; }
                anc = anc.parentNode;
            }
            alert("Org-mode parser - Internal error at line: " + i);
            return null;
        }

        // Handles a '*' line: opens a new section with its header. Returns false on error.
        function star() {
            // The '*' modifier is not allowed on an indented line
            if (indent) { alert("Org-mode parse error: unexpected '*' symbol at line " + i); return false; }
            lastNode = body.appendChild(createNode("section"));
            // The section remains the current node
            lastNode.appendChild(createNode("header", content));
            return true;
        }

        // Handles a '+' or '-' line: adds a list item, opening a new list when needed. Returns false on error.
        function listItem() {
            var container = getContainer();
            if (!container) { return false; }
            var tn = container.tagName;
            if (tn == "section" || tn == "list-item" || tn == "org-mode-document") {
                container = container.appendChild(createNode("list"));
            } else if (tn != "list") {
                alert("Org-mode parser - Internal error - Bad container tag name: " + tn);
                return false;
            }
            lastNode = container.appendChild(createNode("list-item"));
            lastNode = lastNode.appendChild(createNode("paragraph", content));
            // The text of an item is one level deeper than the item itself
            lastNode.setAttribute("indentLevel", level + 1);
            return true;
        }

        // Handles a line without modifier: continues the current paragraph or starts a new one. Returns false on error.
        function noModifier() {
            var lastLevel;
            var container;
            if (lastNode.tagName == "paragraph" && !isAfterDubbleLineBreak) {
                lastLevel = Number(lastNode.getAttribute("indentLevel"));
                // A level-1 paragraph continues whatever the indentation of the line;
                // other paragraphs continue only when the line is at least as indented
                if (lastLevel === 1 || level >= lastLevel) {
                    lastNode.firstChild.appendData(" " + content);
                    return true;
                }
            }
            container = getContainer();
            if (!container) { return false; }
            lastNode = container.appendChild(createNode("paragraph", content));
            return true;
        }

        // Handles a blank line: skips the following whitespace-only lines and records the break
        function dubbleLineBreak() {
            while (lines[i + 1] && /^\s*$/.test(lines[i + 1])) { i++; }
            isAfterDubbleLineBreak = true;
        }

        isAfterDubbleLineBreak = false;
        level = -1; // Indentation level is -1 initially; it will be 0 for the first "*"-bloc
        // Split on the configured separator, then also on bare CR / LF. Without this,
        // input with other line endings stays on one "line" and the regexp, whose '.'
        // does not match line terminators, silently drops everything after the first break.
        lines = text.split(this.LINE_SEPARATOR).join("\n").replace(/\r\n?/g, "\n").split("\n");
        root = this.createRoot();
        if (!root) { return null; } // createRoot() has already reported the failure
        body = root.appendChild(createNode("org-mode-document"));
        lastNode = body;

        for (i = 0; i < lines.length; i++) {
            match = lines[i].match(this.re);
            if (match === null) { alert("org-mode parse error at line: " + i); return null; }
            indent = match[1];
            level = indent.length / this.INDENT_WIDTH;
            // An optional group that did not participate in the match is undefined on
            // standards-compliant engines; normalize it to "" so plain lines are not dropped
            modifier = match[2] ? match[2].charAt(0) : "";
            content = match[3];

            // These conditions tell the parser what to do when encountering a line with a given modifier
            if (content === "") { dubbleLineBreak(); continue; }
            ok = true; // Modifiers added to the regexp without a handler here are ignored
            if (modifier === "*") { ok = star(); }
            else if (modifier === "+" || modifier === "-") { ok = listItem(); }
            else if (modifier === "") { ok = noModifier(); }
            if (!ok) { return null; }
            isAfterDubbleLineBreak = false;
        }
        return root;
    }
};