Writing a very simple lexical analyser in C++

This is a quick and dirty solution: it iterates over the patterns and, for each pattern, searches the entire string for every occurrence, storing each match together with its position in a map. The map implicitly sorts the matches by key (position), so you then only need to iterate over the map to get the matches in positional order, regardless of their pattern name.

#include <iterator>
#include <iostream>
#include <string>
#include <regex>
#include <list>
#include <map>

using namespace std;

// Scans `text` once per entry of `patterns` and collects every regex match.
//
// `patterns` maps a regular expression (key) to the category name (value)
// reported for its matches. The result maps the start offset of each match
// to a (matched text, category name) pair; because std::map is ordered by
// key, iterating the result yields the tokens in the order they appear in
// `text`, regardless of which pattern produced them.
//
// Characters matched by no pattern are simply absent from the result. If two
// patterns match at the same offset, the one visited later wins; patterns are
// visited in the map's key order, i.e. sorted by the regex string.
//
// std::regex's constructor throws std::regex_error for a malformed pattern.
static std::map<std::size_t, std::pair<std::string, std::string>>
tokenize(const std::string& text,
         const std::map<std::string, std::string>& patterns)
{
    std::map<std::size_t, std::pair<std::string, std::string>> matches;

    for (const auto& pattern : patterns)
    {
        // The regex must outlive the iterators created from it, so it is a
        // named local rather than a temporary.
        const std::regex re(pattern.first);
        const std::sregex_iterator last{};

        for (std::sregex_iterator it(text.begin(), text.end(), re); it != last; ++it)
        {
            // position() returns a signed difference_type; it is never
            // negative for a successful match, so the cast is safe.
            const auto offset = static_cast<std::size_t>(it->position());
            matches[offset] = std::make_pair(it->str(), pattern.second);
        }
    }

    return matches;
}

// Demo driver: tokenizes a fixed sample string and prints one
// "<text> <category>" line per token, in positional order.
int main(){

    const std::string str = " hello how are 2 * 3 you? 123 4567867*98";

    // define list of patterns (regex -> category name)
    const std::map<std::string, std::string> patterns {
        { "[0-9]+" ,   "NUMBERS" },
        { "[a-z]+" ,   "IDENTIFIERS" },
        { "\\*|\\+",  "OPERATORS" }
    };

    // '\n' instead of std::endl: no need to flush the stream after every token.
    for (const auto& match : tokenize(str, patterns))
        std::cout << match.second.first << " " << match.second.second << '\n';
}

Output:

hello IDENTIFIERS
how IDENTIFIERS
are IDENTIFIERS
2 NUMBERS
* OPERATORS
3 NUMBERS
you IDENTIFIERS
123 NUMBERS
4567867 NUMBERS
* OPERATORS
98 NUMBERS

Leave a Comment