C ++ scanner.h сканирует содержимое между двойными кавычками как токен: не пропуская пробелы внутри кавычек - PullRequest
0 голосов
/ 05 декабря 2011

Я пытаюсь получить содержимое между двойными кавычками, чтобы считать один токен для назначения.

Например: "hello world" = 1 токен "hello" "world" = 3 токена (потому что пробел считается как 1 токен)

Я создал main.cpp и добавил «scanQuotesAsString»код для 3 модулей:

  • scanner.cpp
  • scanner.h
  • scanpriv.h

Прямо сейчас, "приветМир "сканирует 2 жетона, не пропуская пробел.Если я добавлю (или пропущу пробел, то обычный ввод, такой как | hello world | без кавычек, также пропускает пробелы.

Я думаю, что моя проблема в файле scanner.cpp, где используются последние две функции:

/*
* Private method: scanToEndOfIdentifier
* Usage: finish = scanToEndOfIdentifier();
* ----------------------------------------
* This function advances the position of the scanner until it
* reaches the end of a sequence of letters or digits that make
* up an identifier. The return value is the index of the last
* character in the identifier; the value of the stored index
* cp is the first character after that.
*/
int Scanner::scanToEndOfIdentifier() {
    while (cp < len && isalnum(buffer[cp])) {
        if ((stringOption == ScanQuotesAsStrings) && (buffer[cp] == '"')) 
            break;
        cp++;
    }
    return cp - 1;
}


/* Private functions */
/*
* Private method: scanQuotedString
* Usage: scanQuotedString();
* -------------------
* This function advances the position of the scanner until the
* current character is a double quotation mark
*/
void Scanner::scanQuotedString() {
    while ((cp < len && (buffer[cp] == '"')) || (cp < len && (buffer[cp] == '"'))){
        cp++;
    }

Вот main.cc

#include "genlib.h"
#include "simpio.h"
#include "scanner.h"
#include <iostream>

/* Private function prototypes */

int CountTokens(string str);

int main() {
    cout << "Please enter a sentence: ";
    string str = GetLine();

    int num = CountTokens(str);
    cout << "You entered " << num << " tokens." << endl;
    return 0;
}

int CountTokens(string str) {

    int count = 0;
    Scanner scanner;        // create new scanner object            
    scanner.setInput(str);  // initialize the input to be scanned

    //scanner.setSpaceOption(Scanner::PreserveSpaces);
    scanner.setStringOption(Scanner::ScanQuotesAsStrings);

    while (scanner.hasMoreTokens()) { // read tokens from the scanner
        scanner.nextToken();
        count++;
    }
    return count;
}

Вот scanner.cpp

/*
* File: scanner.cpp
* -----------------
* Implementation for the simplified Scanner class.
*/
#include "genlib.h"
#include "scanner.h"
#include <cctype>
#include <iostream>
/*
* The details of the representation are inaccessible to the client,
* but consist of the following fields:
*
* buffer -- String passed to setInput
* len -- Length of buffer, saved for efficiency
* cp -- Current character position in the buffer
* spaceOption -- Setting of the space option extension
*/
Scanner::Scanner() {
    buffer = "";
    spaceOption = PreserveSpaces;
}
Scanner::~Scanner() {
/* Empty */
}
void Scanner::setInput(string str) {
    buffer = str;
    len = buffer.length();
    cp = 0;
}
/*
* Implementation notes: nextToken
* -------------------------------
* The code for nextToken follows from the definition of a token.
*/
string Scanner::nextToken() {
    if (cp == -1) {
        Error("setInput has not been called");
    }
    if (stringOption == ScanQuotesAsStrings) scanQuotedString();
    if (spaceOption == IgnoreSpaces) skipSpaces();
    int start = cp;
    if (start >= len) return "";
    if (isalnum(buffer[cp])) {
        int finish = scanToEndOfIdentifier();
        return buffer.substr(start, finish - start + 1);
    }
    cp++;
    return buffer.substr(start, 1);
}

bool Scanner::hasMoreTokens() {
    if (cp == -1) {
        Error("setInput has not been called");
    }
    if (stringOption == ScanQuotesAsStrings) scanQuotedString();
    if (spaceOption == IgnoreSpaces) skipSpaces();
    return (cp < len);
}

void Scanner::setSpaceOption(spaceOptionT option) {
    spaceOption = option;
}

Scanner::spaceOptionT Scanner::getSpaceOption() {
    return spaceOption;
}

void Scanner::setStringOption(stringOptionT option) {
    stringOption = option;
}

Scanner::stringOptionT Scanner::getStringOption() {
    return stringOption;
}


/* Private functions */
/*
* Private method: skipSpaces
* Usage: skipSpaces();
* -------------------
* This function advances the position of the scanner until the
* current character is not a whitespace character.
*/
void Scanner::skipSpaces() {
    while (cp < len && isspace(buffer[cp])) {
        cp++;
    }
}

    /*
    * Private method: scanToEndOfIdentifier
    * Usage: finish = scanToEndOfIdentifier();
    * ----------------------------------------
    * This function advances the position of the scanner until it
    * reaches the end of a sequence of letters or digits that make
    * up an identifier. The return value is the index of the last
    * character in the identifier; the value of the stored index
    * cp is the first character after that.
    */
    int Scanner::scanToEndOfIdentifier() {
        while (cp < len && isalnum(buffer[cp])) {
            if ((stringOption == ScanQuotesAsStrings) && (buffer[cp] == '"')) 
                break;
            cp++;
        }
        return cp - 1;
    }


    /* Private functions */
    /*
    * Private method: scanQuotedString
    * Usage: scanQuotedString();
    * -------------------
    * This function advances the position of the scanner until the
    * current character is a double quotation mark
    */
    void Scanner::scanQuotedString() {
        while ((cp < len && (buffer[cp] == '"')) || (cp < len && (buffer[cp] == '"'))){
            cp++;
        }

scanner.h

/*
* File: scanner.h
* ---------------
* This file is the interface for a class that facilitates dividing
* a string into logical units called "tokens", which are either
*
* 1. Strings of consecutive letters and digits representing words
* 2. One-character strings representing punctuation or separators
*
* To use this class, you must first create an instance of a
* Scanner object by declaring
*
* Scanner scanner;
*
* You initialize the scanner's input stream by calling
*
* scanner.setInput(str);
*
* where str is the string from which tokens should be read.
* Once you have done so, you can then retrieve the next token
* by making the following call:
*
* token = scanner.nextToken();
*
* To determine whether any tokens remain to be read, you can call
* the predicate method scanner.hasMoreTokens(). The nextToken
* method returns the empty string after the last token is read.
*
* The following code fragment serves as an idiom for processing
* each token in the string inputString:
*
* Scanner scanner;
* scanner.setInput(inputString);
* while (scanner.hasMoreTokens()) {
* string token = scanner.nextToken();
* . . . process the token . . .
* }
*
* This version of the Scanner class includes an option for skipping
* whitespace characters, which is described in the comments for the
* setSpaceOption method.
*/
#ifndef _scanner_h
#define _scanner_h
#include "genlib.h"
/*
* Class: Scanner
* --------------
* This class is used to represent a single instance of a scanner.
*/
class Scanner {
public:
/*
* Constructor: Scanner
* Usage: Scanner scanner;
* -----------------------
* The constructor initializes a new scanner object. The scanner
* starts empty, with no input to scan.
*/
    Scanner();
/*
* Destructor: ~Scanner
* Usage: usually implicit
* -----------------------
* The destructor deallocates any memory associated with this scanner.
*/
    ~Scanner();
/*
* Method: setInput
* Usage: scanner.setInput(str);
* -----------------------------
* This method configures this scanner to start extracting
* tokens from the input string str. Any previous input string is
* discarded.
*/
    void setInput(string str);
/*
* Method: nextToken
* Usage: token = scanner.nextToken();
* -----------------------------------
* This method returns the next token from this scanner. If
* nextToken is called when no tokens are available, it returns the
* empty string.
*/
    string nextToken();
/*
* Method: hasMoreTokens
* Usage: if (scanner.hasMoreTokens()) . . .
* ------------------------------------------
* This method returns true as long as there are additional
* tokens for this scanner to read.
*/
    bool hasMoreTokens();
/*
* Methods: setSpaceOption, getSpaceOption
* Usage: scanner.setSpaceOption(option);
* option = scanner.getSpaceOption();
* ------------------------------------------
* This method controls whether this scanner
* ignores whitespace characters or treats them as valid tokens.
* By default, the nextToken function treats whitespace characters,
* such as spaces and tabs, just like any other punctuation mark.
* If, however, you call
*
* scanner.setSpaceOption(Scanner::IgnoreSpaces);
*
* the scanner will skip over any white space before reading a
* token. You can restore the original behavior by calling
*
* scanner.setSpaceOption(Scanner::PreserveSpaces);
*
* The getSpaceOption function returns the current setting
* of this option.
*/
    enum spaceOptionT { PreserveSpaces, IgnoreSpaces };
    void setSpaceOption(spaceOptionT option);
    spaceOptionT getSpaceOption();

/*
 * Methods: setStringOption, getStringOption
 * Usage: scanner.setStringOption(option);
 *        option = scanner.getStringOption();
 * --------------------------------------------------
 * This method controls how the scanner reads double quotation marks 
 * as input.  The default is set to treat quotes just like any other 
 * punctuation character: 
 *    scanner.setStringOption(Scanner::ScanQuotesAsPunctuation);
 * 
 * Otherwise, the option:
 *    scanner.setStringOption(Scanner::ScanQuotesAsStrings);
 *
 * the token starting with a quotation mark will be scanned until
 * another quotation mark is found (closing quotation). Therefore
 * the entire string within the quotation, including both quotation
 * marks counts as 1 token.
 */
    enum stringOptionT { ScanQuotesAsPunctuation, ScanQuotesAsStrings };

    void setStringOption(stringOptionT option);
    stringOptionT getStringOption();


private:

#include "scanpriv.h"
};
#endif

** и, наконец, scanpriv.h **

/*
* File: scanpriv.h
* ----------------
* This file contains the private data for the simplified version
* of the Scanner class.
*/

/* Instance variables */
string buffer; /* The string containing the tokens */
int len; /* The buffer length, for efficiency */
int cp; /* The current index in the buffer */
spaceOptionT spaceOption; /* Setting of the space option */
stringOptionT stringOption;

/* Private method prototypes */
void skipSpaces();
int scanToEndOfIdentifier();
void scanQuotedString();

Ответы [ 2 ]

3 голосов
/ 05 декабря 2011

Чтобы долго читать.

Два способа разбора цитируемого текста:

0) Штат

Простой переключатель, который сообщает, находитесь ли вы в кавычках прямо сейчас, и который активирует некоторую специальную обработку предложений В основном это будет эквивалентно # 1), просто в строке.

1) Подправило в сканере рекурсивного спуска

Уберите состояние и напишите отдельное правило для сканирования цитируемого текста. Код на самом деле будет довольно простым (p-код, вдохновленный C ++):

// assume we are one behind the opening quotation mark
for (c : text) {
    if (is_escape (*c)) {  // to support stuff like "foo's name is \"bar\""
        p = peek(c);
        if (!is_valid_escape_character (peek (c))) error;
        else {
            make the peeked character (*p) part of the result;
            ++c;
        }
    }
    else if (is_quotation_mark (*c))
    {
        return the result; // we approached the end of the string
    }
    else if (!is_valid_character (*c))
    {
        error; // maybe you want to forbid literal control characters
    }
    else
    {
        make *c part of the result
    }
}
error; // reached end of input before closing quotation mark

Если вы не хотите поддерживать экранирующие символы, код становится проще:

// assume we are one behind the opening quotation mark
for (c : text) {
    if (is_quotation_mark (*c))
        return the result;
    else if (!is_valid_character (*c))
        error;
    else
        make *c part of the result
}
error; // reached end of input before closing quotation mark

Не следует опускать проверку, является ли она недопустимым символом, так как это может побудить пользователей использовать ваш код и, возможно, использовать неопределенное поведение вашей программы.

0 голосов
/ 05 декабря 2011

Быстрый взгляд на код: если вы находитесь в режиме ScanQuotesAsStrings, вы не ожидаете никаких других токенов, кроме строк в кавычках; скорее разница должна быть в том, что когда вы видите токен, начинающийся с '"', вы переходите на отдельный субсканер.

В псевдокоде (с использованием языка C ++ "конечный итератор - идиома один за другим"):

current_token.begin = cursor;
current_token.end = current_token.begin + 1;
if(scan_quotes_as_strings && *current_token.begin == '"') {
    while(*current_token.end && *current_token.end != '"')
        ++current_token.end;
    return;
}
while(*current_token.end && *current_token.end != ' ')
    ++current_token.end;

Вы можете объединить эти два цикла в один, введя переменную состояния, а не выражая состояние сканера различными путями кода.

Кроме того,

while ((cp < len && (buffer[cp] == '"')) || (cp < len && (buffer[cp] == '"'))) ...

выглядит просто подозрительно.

...