Lucene++ - a full-featured C++ search engine
API Documentation


CharTokenizer.h
Go to the documentation of this file.
1
2// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3// Distributable under the terms of either the Apache License (Version 2.0)
4// or the GNU Lesser General Public License.
6
7#ifndef CHARTOKENIZER_H
8#define CHARTOKENIZER_H
9
10#include "Tokenizer.h"
11
12namespace Lucene {
13
15class LPPAPI CharTokenizer : public Tokenizer {
16public:
17 CharTokenizer(const ReaderPtr& input);
18 CharTokenizer(const AttributeSourcePtr& source, const ReaderPtr& input);
19 CharTokenizer(const AttributeFactoryPtr& factory, const ReaderPtr& input);
20 virtual ~CharTokenizer();
21
23
24protected:
25 int32_t offset;
26 int32_t bufferIndex;
27 int32_t dataLen;
28
29 static const int32_t MAX_WORD_LEN;
30 static const int32_t IO_BUFFER_SIZE;
31
32 CharArray ioBuffer;
35
36public:
37 virtual bool incrementToken();
38 virtual void end();
39 virtual void reset(const ReaderPtr& input);
40
41protected:
45 virtual bool isTokenChar(wchar_t c) = 0;
46
49 virtual wchar_t normalize(wchar_t c);
50};
51
52}
53
54#endif
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
An abstract base class for simple, character-oriented tokenizers.
Definition: CharTokenizer.h:15
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will ...
CharTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input)
CharArray ioBuffer
Definition: CharTokenizer.h:32
OffsetAttributePtr offsetAtt
Definition: CharTokenizer.h:34
CharTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input)
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() ...
static const int32_t IO_BUFFER_SIZE
Definition: CharTokenizer.h:30
virtual bool isTokenChar(wchar_t c)=0
Returns true if a character should be included in a token. This tokenizer generates as tokens adjacen...
TermAttributePtr termAtt
Definition: CharTokenizer.h:33
static const int32_t MAX_WORD_LEN
Definition: CharTokenizer.h:29
int32_t offset
Definition: CharTokenizer.h:25
int32_t dataLen
Definition: CharTokenizer.h:27
virtual wchar_t normalize(wchar_t c)
Called on each token character to normalize it before it is added to the token. The default implement...
virtual bool incrementToken()
Consumers (i.e., IndexWriter) use this method to advance the stream to the next token....
CharTokenizer(const ReaderPtr &input)
int32_t bufferIndex
Definition: CharTokenizer.h:26
A Tokenizer is a TokenStream whose input is a Reader.
Definition: Tokenizer.h:20
Definition: AbstractAllTermDocs.h:12
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition: LuceneTypes.h:520
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition: LuceneTypes.h:58
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition: LuceneTypes.h:40
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition: LuceneTypes.h:519

clucene.sourceforge.net