У меня есть решение на C. Казалось бы, слишком много сложностей, чтобы этого было легко достичь в сценарии оболочки.Программа не слишком сложна, но, тем не менее, содержит более 200 строк кода, которые включают проверку ошибок, некоторую оптимизацию скорости и другие тонкости.
Исходный файл split-brackets-to-chunks.c:
#include <stdio.h>
/* Example code by jcxz100 - your problem if you use it! */
#define BUFF_IN_MAX 255
#define BUFF_IN_SIZE (BUFF_IN_MAX+1)
#define OUT_NAME_MAX 31
#define OUT_NAME_SIZE (OUT_NAME_MAX+1)
#define NO_CHAR '\0'
int main()
{
char pcBuff[BUFF_IN_SIZE];
size_t iReadActual;
FILE *pFileIn, *pFileOut;
int iNumberOfOutputFiles;
char pszOutName[OUT_NAME_SIZE];
char cLiteralChar, cAtomicChar, cChunkStartChar, cChunkEndChar;
int iChunkNesting;
char *pcOutputStart;
size_t iOutputLen;
pcBuff[BUFF_IN_MAX] = '\0'; /* ... just to be sure. */
iReadActual = 0;
pFileIn = pFileOut = NULL;
iNumberOfOutputFiles = 0;
pszOutName[OUT_NAME_MAX] = '\0'; /* ... just to be sure. */
cLiteralChar = cAtomicChar = cChunkStartChar = cChunkEndChar = NO_CHAR;
iChunkNesting = 0;
pcOutputStart = (char*)pcBuff;
iOutputLen = 0;
if ((pFileIn = fopen("input-utf-8.txt", "r")) == NULL)
{
printf("What? Where?\n");
return 1;
}
while ((iReadActual = fread(pcBuff, sizeof(char), BUFF_IN_MAX, pFileIn)) > 0)
{
char *pcPivot, *pcStop;
pcBuff[iReadActual] = '\0'; /* ... just to be sure. */
pcPivot = (char*)pcBuff;
pcStop = (char*)pcBuff + iReadActual;
while (pcPivot < pcStop)
{
if (cLiteralChar != NO_CHAR) /* Ignore this char? */
{
/* Yes, ignore this char. */
if (cChunkStartChar != NO_CHAR)
{
/* ... just write it out: */
fprintf(pFileOut, "%c", *pcPivot);
}
pcPivot++;
cLiteralChar = NO_CHAR;
/* End of "Yes, ignore this char." */
}
else if (cAtomicChar != NO_CHAR) /* Are we inside an atomic string? */
{
/* Yup; we are inside an atomic string. */
int bBreakInnerWhile;
bBreakInnerWhile = 0;
pcOutputStart = pcPivot;
while (bBreakInnerWhile == 0)
{
if (*pcPivot == '\\') /* Treat next char as literal? */
{
cLiteralChar = '\\'; /* Yes. */
bBreakInnerWhile = 1;
}
else if (*pcPivot == cAtomicChar) /* End of atomic? */
{
cAtomicChar = NO_CHAR; /* Yes. */
bBreakInnerWhile = 1;
}
if (++pcPivot == pcStop) bBreakInnerWhile = 1;
}
if (cChunkStartChar != NO_CHAR)
{
/* The atomic string is part of a chunk. */
iOutputLen = (size_t)(pcPivot-pcOutputStart);
fprintf(pFileOut, "%.*s", iOutputLen, pcOutputStart);
}
/* End of "Yup; we are inside an atomic string." */
}
else if (cChunkStartChar == NO_CHAR) /* Are we inside a chunk? */
{
/* No, we are outside a chunk. */
int bBreakInnerWhile;
bBreakInnerWhile = 0;
while (bBreakInnerWhile == 0)
{
/* Detect start of anything interesting: */
switch (*pcPivot)
{
/* Start of atomic? */
case '"':
case '\'':
cAtomicChar = *pcPivot;
bBreakInnerWhile = 1;
break;
/* Start of chunk? */
case '{':
cChunkStartChar = *pcPivot;
cChunkEndChar = '}';
break;
case '[':
cChunkStartChar = *pcPivot;
cChunkEndChar = ']';
break;
case '(':
cChunkStartChar = *pcPivot;
cChunkEndChar = ')';
break;
case '<':
cChunkStartChar = *pcPivot;
cChunkEndChar = '>';
break;
}
if (cChunkStartChar != NO_CHAR)
{
iNumberOfOutputFiles++;
printf("Start '%c' '%c' chunk (file %04d.txt)\n", *pcPivot, cChunkEndChar, iNumberOfOutputFiles);
sprintf((char*)pszOutName, "output/%04d.txt", iNumberOfOutputFiles);
if ((pFileOut = fopen(pszOutName, "w")) == NULL)
{
printf("What? How?\n");
fclose(pFileIn);
return 2;
}
bBreakInnerWhile = 1;
}
else if (++pcPivot == pcStop)
{
bBreakInnerWhile = 1;
}
}
/* End of "No, we are outside a chunk." */
}
else
{
/* Yes, we are inside a chunk. */
int bBreakInnerWhile;
bBreakInnerWhile = 0;
pcOutputStart = pcPivot;
while (bBreakInnerWhile == 0)
{
if (*pcPivot == cChunkStartChar)
{
/* Increase level of brackets/parantheses: */
iChunkNesting++;
}
else if (*pcPivot == cChunkEndChar)
{
/* Decrease level of brackets/parantheses: */
iChunkNesting--;
if (iChunkNesting == 0)
{
/* We are now outside chunk. */
bBreakInnerWhile = 1;
}
}
else
{
/* Detect atomic start: */
switch (*pcPivot)
{
case '"':
case '\'':
cAtomicChar = *pcPivot;
bBreakInnerWhile = 1;
break;
}
}
if (++pcPivot == pcStop) bBreakInnerWhile = 1;
}
iOutputLen = (size_t)(pcPivot-pcOutputStart);
fprintf(pFileOut, "%.*s", iOutputLen, pcOutputStart);
if (iChunkNesting == 0)
{
printf("File done.\n");
cChunkStartChar = cChunkEndChar = NO_CHAR;
fclose(pFileOut);
pFileOut = NULL;
}
/* End of "Yes, we are inside a chunk." */
}
}
}
if (cChunkStartChar != NO_CHAR)
{
printf("Chunk exceeds end-of-file. Exiting gracefully.\n");
fclose(pFileOut);
pFileOut = NULL;
}
if (iNumberOfOutputFiles == 0) printf("Nothing to do...\n");
else printf("All done.\n");
fclose(pFileIn);
return 0;
}
Я решил приятных для обладания и один из более дальних приятных для обладания ,Чтобы показать это, ввод немного сложнее, чем пример в вопросе:
junk text
"atomic junk"
some junk text followed by a start bracket { here is the actual payload
more payload
'atomic payload { with start bracket that should be ignored'
nested start bracket { - all of this line is untouchable payload too
here is more payload
"this atomic has a literal double-quote \" inside"
"yet more atomic payload; this one's got a smiley ;-) and a heart <3"
end of nested bracket pair } - all of this line is untouchable payload too
this is payload too
"here's a totally unprovoked $ sign and an * asterisk"
} trailing junk
intermittent junk
<
payload that goes in second output file } mismatched end bracket should be ignored >
end junk
Результирующий файл output / 0001.txt :
{ here is the actual payload
more payload
'atomic payload { with start bracket that should be ignored'
nested start bracket { - all of this line is untouchable payload too
here is more payload
"this atomic has a literal double-quote \" inside"
"yet more atomic payload; this one's got a smiley ;-) and a heart <3"
end of nested bracket pair } - all of this line is untouchable payload too
this is payload too
"here's a totally unprovoked $ sign and an * asterisk"
}
... и полученный файл output / 0002.txt :
<
payload that goes in second output file } mismatched end bracket should be ignored >
Спасибо @dawg за помощь:)