Lucene++ - a full-featured, c++ search engine
API Documentation


 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
IndexWriter.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef INDEXWRITER_H
8 #define INDEXWRITER_H
9 
10 #include "MergePolicy.h"
11 
12 namespace Lucene {
13 
// An IndexWriter creates and maintains an index.
// NOTE(review): this is a rendered listing. The leading number on each line is the header's own
// line number; gaps in the numbering are header lines the rendering left out (the page's reference
// list cites members at some of those lines), so this is not the complete declaration.
90 class LPPAPI IndexWriter : public LuceneObject {
91 protected:
92  IndexWriter(const DirectoryPtr& d, const AnalyzerPtr& a, bool create, const IndexDeletionPolicyPtr& deletionPolicy, int32_t mfl, const IndexingChainPtr& indexingChain, const IndexCommitPtr& commit); // protected overload: the only one that also takes an indexing chain
93 
94 public:
95  IndexWriter(const DirectoryPtr& d, const AnalyzerPtr& a, bool create, int32_t mfl); // mfl: presumably the maximum field length (see maxFieldLength) - confirm in the implementation
96  IndexWriter(const DirectoryPtr& d, const AnalyzerPtr& a, int32_t mfl);
97  IndexWriter(const DirectoryPtr& d, const AnalyzerPtr& a, const IndexDeletionPolicyPtr& deletionPolicy, int32_t mfl);
98  IndexWriter(const DirectoryPtr& d, const AnalyzerPtr& a, bool create, const IndexDeletionPolicyPtr& deletionPolicy, int32_t mfl);
99  IndexWriter(const DirectoryPtr& d, const AnalyzerPtr& a, const IndexDeletionPolicyPtr& deletionPolicy, int32_t mfl, const IndexCommitPtr& commit);
100  virtual ~IndexWriter();
101 
103 
104 protected:
105  int64_t writeLockTimeout; // per-instance value; presumably what setWriteLockTimeout()/getWriteLockTimeout() operate on - confirm
106 
110  static const int32_t MERGE_READ_BUFFER_SIZE; // read buffer size used during merging; the normal read buffer size defaults to 1024
111 
113  static int32_t MESSAGE_ID;
114  int32_t messageID;
115  bool hitOOM; // presumably set via handleOOM() - confirm in the implementation
116 
117  DirectoryPtr directory; // where this index resides
118  AnalyzerPtr analyzer; // how to analyze text
119 
120  bool create;
124 
125  SimilarityPtr similarity; // how to normalize
126 
127  int64_t changeCount; // increments every time a change is completed
128  int64_t lastCommitChangeCount; // last changeCount that was committed
129 
130  SegmentInfosPtr rollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails
131  MapSegmentInfoInt rollbackSegments;
132 
133  SegmentInfosPtr localRollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails
135 
136  SegmentInfosPtr segmentInfos; // the segments
137 
140 
141  SetSegmentInfo segmentsToOptimize; // used by optimize to note those needing optimization
143 
145 
147 
148  bool closed;
149  bool closing;
150 
151  SetSegmentInfo mergingSegments;
155  SetOneMerge runningMerges;
157  int64_t mergeGen;
159 
160  int32_t flushCount;
162 
164  int32_t readCount; // count of how many threads are holding read lock
165  int64_t writeThread; // non-null if any thread holds write lock
166  int32_t upgradeCount;
167 
169 
170  // This is a "write once" variable (like the organic dye on a DVD-R that may or may not
171  // be heated by a laser and then cooled to permanently record the event): it's false,
172  // until getReader() is called for the first time, at which point it's switched to true
173  // and never changes back to false. Once this is true, we hold open and reuse SegmentReader
174  // instances internally for applying deletes, doing merges, and reopening near real-time readers.
176  // (The variable described above is not shown in this listing; the page's reference list places "bool poolReaders" at header line 175.)
177 
187  int32_t maxFieldLength; // maximum number of terms indexed for a single field in a document; terms beyond it are not indexed (10,000 by default)
188 
191 
192  HashSet<String> synced; // files that have been sync'd already
193  HashSet<String> syncing; // files that are now being sync'd
194 
196 
199 
200 INTERNAL: // access label supplied by a project macro; its expansion is not shown in this listing
201  SegmentInfosPtr pendingCommit; // set when a commit is pending (after prepareCommit() & before commit())
203 
205 
206 public:
209  static int64_t WRITE_LOCK_TIMEOUT; // default value for the write lock timeout (1,000); not const, see setDefaultWriteLockTimeout()
210 
211  static const String WRITE_LOCK_NAME;
212 
214  static const int32_t DISABLE_AUTO_FLUSH; // value to denote a flush trigger is disabled
215 
218  static const int32_t DEFAULT_MAX_BUFFERED_DOCS; // disabled by default, because IndexWriter flushes by RAM usage by default
219 
222  static const double DEFAULT_RAM_BUFFER_SIZE_MB; // 16 MB: flush when buffered docs consume 16 MB RAM
223 
226  static const int32_t DEFAULT_MAX_BUFFERED_DELETE_TERMS; // disabled by default, because IndexWriter flushes by RAM usage by default
227 
229  static const int32_t DEFAULT_MAX_FIELD_LENGTH; // 10,000; change using setMaxFieldLength(int32_t)
230 
232  static const int32_t DEFAULT_TERM_INDEX_INTERVAL; // 128; change using setTermIndexInterval(int32_t)
233 
237  static int32_t MAX_TERM_LENGTH(); // exposed as a static function rather than a constant
238 
240  static const int32_t MaxFieldLengthUNLIMITED; // sets the maximum field length to INT_MAX
241 
243  static const int32_t MaxFieldLengthLIMITED; // sets the maximum field length to DEFAULT_MAX_FIELD_LENGTH
244 
245 public:
246  virtual void initialize();
247 
285  virtual IndexReaderPtr getReader(); // the first call puts this writer in near real-time mode (SegmentReader instances are then held open and reused)
286 
296  virtual IndexReaderPtr getReader(int32_t termInfosIndexDivisor);
297 
300  virtual int32_t numDeletedDocs(const SegmentInfoPtr& info);
301 
302  virtual void acquireWrite(); // acquire*/release*/upgradeReadToWrite: presumably maintain readCount, writeThread and upgradeCount - confirm
303  virtual void releaseWrite();
304  virtual void acquireRead();
305 
308  virtual void upgradeReadToWrite();
309 
310  virtual void releaseRead();
311  virtual bool isOpen(bool includePendingClose);
312  virtual void message(const String& message);
313 
322  virtual bool getUseCompoundFile();
323 
330  virtual void setUseCompoundFile(bool value);
331 
333  virtual void setSimilarity(const SimilarityPtr& similarity);
334 
337  virtual SimilarityPtr getSimilarity();
338 
355  virtual void setTermIndexInterval(int32_t interval); // default is DEFAULT_TERM_INDEX_INTERVAL (128)
356 
359  virtual int32_t getTermIndexInterval();
360 
362  virtual void setMergePolicy(const MergePolicyPtr& mp);
363 
366  virtual MergePolicyPtr getMergePolicy();
367 
369  virtual void setMergeScheduler(const MergeSchedulerPtr& mergeScheduler);
370 
373  virtual MergeSchedulerPtr getMergeScheduler();
374 
388  virtual void setMaxMergeDocs(int32_t maxMergeDocs);
389 
398  virtual int32_t getMaxMergeDocs();
399 
409  virtual void setMaxFieldLength(int32_t maxFieldLength); // default is DEFAULT_MAX_FIELD_LENGTH (10,000)
410 
413  virtual int32_t getMaxFieldLength();
414 
418  virtual void setReaderTermsIndexDivisor(int32_t divisor);
419 
421  virtual int32_t getReaderTermsIndexDivisor();
422 
434  virtual void setMaxBufferedDocs(int32_t maxBufferedDocs); // doc-count flush trigger; disabled by default (DEFAULT_MAX_BUFFERED_DOCS)
435 
438  virtual int32_t getMaxBufferedDocs();
439 
461  virtual void setRAMBufferSizeMB(double mb); // RAM-usage flush trigger; default is DEFAULT_RAM_BUFFER_SIZE_MB (16 MB)
462 
464  virtual double getRAMBufferSizeMB();
465 
472  virtual void setMaxBufferedDeleteTerms(int32_t maxBufferedDeleteTerms); // disabled by default (DEFAULT_MAX_BUFFERED_DELETE_TERMS)
473 
476  virtual int32_t getMaxBufferedDeleteTerms();
477 
487  virtual void setMergeFactor(int32_t mergeFactor);
488 
496  virtual int32_t getMergeFactor();
497 
500  static void setDefaultInfoStream(const InfoStreamPtr& infoStream);
501 
504  static InfoStreamPtr getDefaultInfoStream();
505 
508  virtual void setInfoStream(const InfoStreamPtr& infoStream);
509 
512  virtual InfoStreamPtr getInfoStream();
513 
515  virtual bool verbose();
516 
520  virtual void setWriteLockTimeout(int64_t writeLockTimeout);
521 
524  virtual int64_t getWriteLockTimeout();
525 
528  static void setDefaultWriteLockTimeout(int64_t writeLockTimeout);
529 
532  static int64_t getDefaultWriteLockTimeout();
533 
547  virtual void close();
548 
561  virtual void close(bool waitForMerges);
562 
564  virtual DirectoryPtr getDirectory();
565 
567  virtual AnalyzerPtr getAnalyzer();
568 
572  virtual int32_t maxDoc();
573 
578  virtual int32_t numDocs();
579 
580  virtual bool hasDeletions();
581 
603  virtual void addDocument(const DocumentPtr& doc);
604 
613  virtual void addDocument(const DocumentPtr& doc, const AnalyzerPtr& analyzer); // overload taking an explicit analyzer for this document
614 
620  virtual void deleteDocuments(const TermPtr& term);
621 
627  virtual void deleteDocuments(Collection<TermPtr> terms);
628 
634  virtual void deleteDocuments(const QueryPtr& query);
635 
642  virtual void deleteDocuments(Collection<QueryPtr> queries);
643 
652  virtual void updateDocument(const TermPtr& term, const DocumentPtr& doc);
653 
663  virtual void updateDocument(const TermPtr& term, const DocumentPtr& doc, const AnalyzerPtr& analyzer);
664 
665  virtual int32_t getSegmentCount();
666  virtual int32_t getNumBufferedDocuments();
667  virtual int32_t getDocCount(int32_t i);
668  virtual int32_t getFlushCount();
669  virtual int32_t getFlushDeletesCount();
670 
671  virtual String newSegmentName();
672 
707  virtual void optimize();
708 
715  virtual void optimize(int32_t maxNumSegments);
716 
722  virtual void optimize(bool doWait);
723 
729  virtual void optimize(int32_t maxNumSegments, bool doWait);
730 
736  virtual void expungeDeletes(bool doWait);
737 
749  virtual void expungeDeletes();
750 
759  virtual void maybeMerge();
760 
763  virtual OneMergePtr getNextMerge();
764 
770  virtual void rollback();
771 
783  virtual void deleteAll();
784 
789  virtual void waitForMerges();
790 
821  virtual void addIndexesNoOptimize(Collection<DirectoryPtr> dirs);
822 
833  virtual void addIndexes(Collection<IndexReaderPtr> readers);
834 
839  virtual void prepareCommit();
840 
856  // NOTE: commitUserData will only "stick" if there are actually changes in the index to commit.
857  virtual void prepareCommit(MapStringString commitUserData);
858 
876  virtual void commit();
877 
883  virtual void commit(MapStringString commitUserData);
884 
887  virtual int64_t ramSizeInBytes();
888 
890  virtual int32_t numRamDocs();
891 
893  virtual void merge(const OneMergePtr& merge);
894 
896  virtual void mergeSuccess(const OneMergePtr& merge);
897 
901  virtual bool registerMerge(const OneMergePtr& merge);
902 
905  virtual void mergeInit(const OneMergePtr& merge);
906 
908  virtual void mergeFinish(const OneMergePtr& merge);
909 
910  virtual void addMergeException(const OneMergePtr& merge);
911 
913  virtual int32_t getBufferedDeleteTermsSize();
914 
916  virtual int32_t getNumBufferedDeleteTerms();
917 
919  virtual SegmentInfoPtr newestSegment();
920 
921  virtual String segString();
922 
925  static bool isLocked(const DirectoryPtr& directory);
926 
930  static void unlock(const DirectoryPtr& directory);
931 
933  virtual void setMergedSegmentWarmer(const IndexReaderWarmerPtr& warmer); // see IndexReaderWarmer, declared after this class
934 
936  virtual IndexReaderWarmerPtr getMergedSegmentWarmer();
937 
951  virtual bool testPoint(const String& name);
952 
953  virtual bool nrtIsCurrent(const SegmentInfosPtr& infos);
954  virtual bool isClosed();
955 
956 protected:
957  virtual void ensureOpen(bool includePendingClose);
958  virtual void ensureOpen();
959  virtual void setMessageID(const InfoStreamPtr& infoStream);
960 
963  virtual LogMergePolicyPtr getLogMergePolicy();
964 
965  virtual void setRollbackSegmentInfos(const SegmentInfosPtr& infos);
966 
969  virtual void pushMaxBufferedDocs();
970 
971  virtual void messageState();
972 
975  virtual bool shouldClose();
976  virtual void closeInternal(bool waitForMerges);
977 
980  virtual bool flushDocStores();
981 
983  virtual bool optimizeMergesPending();
984 
985  virtual void maybeMerge(bool optimize); // protected overloads of the public maybeMerge()
986  virtual void maybeMerge(int32_t maxNumSegmentsOptimize, bool optimize);
987  virtual void updatePendingMerges(int32_t maxNumSegmentsOptimize, bool optimize);
988 
990  virtual OneMergePtr getNextExternalMerge();
991 
999  virtual void startTransaction(bool haveReadLock);
1000 
1002  virtual void rollbackTransaction();
1003 
1006  virtual void commitTransaction();
1007  virtual void rollbackInternal();
1008 
1009  virtual void finishMerges(bool waitForMerges);
1010 
1013  virtual void checkpoint();
1014 
1015  virtual void finishAddIndexes();
1016  virtual void blockAddIndexes(bool includePendingClose);
1017  virtual void resumeAddIndexes();
1018  virtual void resetMergeExceptions();
1019  virtual void noDupDirs(Collection<DirectoryPtr> dirs);
1020 
1021  virtual bool hasExternalSegments();
1022 
1027  virtual void resolveExternalSegments();
1028 
1031  virtual void doAfterFlush();
1032 
1035  virtual void doBeforeFlush();
1036 
1037  virtual void commit(int64_t sizeInBytes); // protected overload of the public commit()
1038  virtual void finishCommit();
1039 
1044  virtual void flush(bool triggerMerge, bool flushDocStores, bool flushDeletes);
1045  virtual bool doFlush(bool flushDocStores, bool flushDeletes);
1046  virtual bool doFlushInternal(bool flushDocStores, bool flushDeletes);
1047 
1048  virtual int32_t ensureContiguousMerge(const OneMergePtr& merge);
1049 
1055  virtual void commitMergedDeletes(const OneMergePtr& merge, const SegmentReaderPtr& mergeReader);
1056  virtual bool commitMerge(const OneMergePtr& merge, const SegmentMergerPtr& merger, int32_t mergedDocCount, const SegmentReaderPtr& mergedReader);
1057 
1058  virtual LuceneException handleMergeException(const LuceneException& exc, const OneMergePtr& merge); // returns the exception object by value
1059 
1060  virtual void _mergeInit(const OneMergePtr& merge);
1061 
1062  virtual void setDiagnostics(const SegmentInfoPtr& info, const String& source);
1063  virtual void setDiagnostics(const SegmentInfoPtr& info, const String& source, MapStringString details);
1064 
1065  virtual void setMergeDocStoreIsCompoundFile(const OneMergePtr& merge);
1066  virtual void closeMergeReaders(const OneMergePtr& merge, bool suppressExceptions);
1067 
1070  virtual int32_t mergeMiddle(const OneMergePtr& merge);
1071 
1073  virtual bool applyDeletes();
1074 
1075  virtual String segString(const SegmentInfosPtr& infos);
1076 
1077  virtual bool startSync(const String& fileName, HashSet<String> pending);
1078  virtual void finishSync(const String& fileName, bool success);
1079 
1081  bool waitForAllSynced(HashSet<String> syncing); // non-virtual, unlike most members of this class
1082  void doWait(); // non-virtual
1083 
1087  virtual void startCommit(int64_t sizeInBytes, MapStringString commitUserData);
1088 
1089  virtual LuceneException handleOOM(const std::bad_alloc& oom, const String& location); // takes a std::bad_alloc and returns a LuceneException by value
1090 
1091  friend class ReaderPool; // grants ReaderPool access to the non-public members above
1092 };
1093 
// Abstract callback interface with a single hook, warm(). An instance is handed to an
// IndexWriter through setMergedSegmentWarmer(). It is relevant once getReader() has been
// called (i.e. the writer is in near real-time mode) and concerns the point after a merge completes.
1100 class LPPAPI IndexReaderWarmer : public LuceneObject {
1101 public:
1102  virtual ~IndexReaderWarmer();
1103 
1105 
1106 public:
1107  virtual void warm(const IndexReaderPtr& reader) = 0; // pure virtual: concrete warmers must implement this
1108 };
1109 
1110 }
1111 
1112 #endif
static const int32_t DEFAULT_MAX_BUFFERED_DOCS
Disabled by default (because IndexWriter flushes by RAM usage by default). Change using setMaxBufferedDocs(int32_t).
Definition: IndexWriter.h:218
An IndexWriter creates and maintains an index.
Definition: IndexWriter.h:90
static const String WRITE_LOCK_NAME
Definition: IndexWriter.h:211
static int64_t WRITE_LOCK_TIMEOUT
Default value for the write lock timeout (1,000).
Definition: IndexWriter.h:209
static const int32_t MaxFieldLengthLIMITED
Sets the maximum field length to DEFAULT_MAX_FIELD_LENGTH.
Definition: IndexWriter.h:243
int32_t termIndexInterval
Definition: IndexWriter.h:146
static const int32_t MaxFieldLengthUNLIMITED
Sets the maximum field length to INT_MAX.
Definition: IndexWriter.h:240
SetSegmentInfo segmentsToOptimize
Definition: IndexWriter.h:141
SetOneMerge runningMerges
Definition: IndexWriter.h:155
boost::shared_ptr< InfoStream > InfoStreamPtr
Definition: LuceneTypes.h:532
static const int32_t DISABLE_AUTO_FLUSH
Value to denote a flush trigger is disabled.
Definition: IndexWriter.h:214
boost::shared_ptr< Term > TermPtr
Definition: LuceneTypes.h:233
boost::shared_ptr< Analyzer > AnalyzerPtr
Definition: LuceneTypes.h:20
bool closing
Definition: IndexWriter.h:149
boost::shared_ptr< IndexFileDeleter > IndexFileDeleterPtr
Definition: LuceneTypes.h:154
boost::shared_ptr< OneMerge > OneMergePtr
Definition: LuceneTypes.h:192
boost::shared_ptr< MergePolicy > MergePolicyPtr
Definition: LuceneTypes.h:174
bool hitOOM
Definition: IndexWriter.h:115
bool stopMerges
Definition: IndexWriter.h:158
int32_t optimizeMaxNumSegments
Definition: IndexWriter.h:142
Collection< OneMergePtr > mergeExceptions
Definition: IndexWriter.h:156
boost::shared_ptr< IndexReaderWarmer > IndexReaderWarmerPtr
Definition: LuceneTypes.h:158
MapSegmentInfoInt rollbackSegments
Definition: IndexWriter.h:131
IndexReaderWarmerPtr mergedSegmentWarmer
Definition: IndexWriter.h:195
int64_t pendingCommitChangeCount
Definition: IndexWriter.h:202
int64_t changeCount
Definition: IndexWriter.h:127
int32_t maxFieldLength
The maximum number of terms that will be indexed for a single field in a document. This limits the amount of memory required for indexing, so that collections with very large files will not crash the indexing process by running out of memory. Note that this effectively truncates large documents, excluding from the index terms that occur further in the document. If you know your source documents are large, be sure to set this value high enough to accommodate the expected size. If you set it to INT_MAX, then the only limit is your memory, but you should anticipate an std::bad_alloc. By default, no more than 10,000 terms will be indexed for a field.
Definition: IndexWriter.h:187
SetSegmentInfo mergingSegments
Definition: IndexWriter.h:151
int64_t lastCommitChangeCount
Definition: IndexWriter.h:128
int32_t messageID
Definition: IndexWriter.h:114
bool closed
Definition: IndexWriter.h:148
IndexFileDeleterPtr deleter
Definition: IndexWriter.h:139
SynchronizePtr commitLock
Used only by commit; lock order is commitLock -> IW.
Definition: IndexWriter.h:198
DocumentsWriterPtr docWriter
Definition: IndexWriter.h:138
boost::shared_ptr< MergeScheduler > MergeSchedulerPtr
Definition: LuceneTypes.h:175
static const int32_t MERGE_READ_BUFFER_SIZE
The normal read buffer size defaults to 1024, but increasing this during merging seems to yield performance gains.
Definition: IndexWriter.h:110
IndexCommitPtr indexCommit
Definition: IndexWriter.h:123
boost::shared_ptr< SegmentInfo > SegmentInfoPtr
Definition: LuceneTypes.h:208
boost::shared_ptr< LogMergePolicy > LogMergePolicyPtr
Definition: LuceneTypes.h:172
int64_t writeLockTimeout
Definition: IndexWriter.h:102
static const double DEFAULT_RAM_BUFFER_SIZE_MB
Default value is 16 MB (which means flush when buffered docs consume 16 MB RAM). Change using setRAMBufferSizeMB(double).
Definition: IndexWriter.h:222
boost::shared_ptr< Lock > LockPtr
Definition: LuceneTypes.h:496
SynchronizePtr messageIDLock
Definition: IndexWriter.h:112
boost::shared_ptr< SegmentInfos > SegmentInfosPtr
Definition: LuceneTypes.h:210
LockPtr writeLock
Definition: IndexWriter.h:144
boost::shared_ptr< DocumentsWriter > DocumentsWriterPtr
Definition: LuceneTypes.h:123
int32_t flushCount
Definition: IndexWriter.h:160
boost::shared_ptr< Document > DocumentPtr
Definition: LuceneTypes.h:74
boost::shared_ptr< IndexDeletionPolicy > IndexDeletionPolicyPtr
Definition: LuceneTypes.h:153
boost::shared_ptr< Query > QueryPtr
Definition: LuceneTypes.h:420
int64_t mergeGen
Definition: IndexWriter.h:157
int32_t localFlushedDocCount
Definition: IndexWriter.h:134
ReaderPoolPtr readerPool
Definition: IndexWriter.h:204
HashSet< String > syncing
Definition: IndexWriter.h:193
If getReader has been called (ie, this writer is in near real-time mode), then after a merge completes, this class can be invoked to warm the reader on the newly merged segment, before the merge commits.
Definition: IndexWriter.h:1100
boost::shared_ptr< IndexReader > IndexReaderPtr
Definition: LuceneTypes.h:157
boost::shared_ptr< Directory > DirectoryPtr
Definition: LuceneTypes.h:489
Base class for all Lucene classes.
Definition: LuceneObject.h:31
AnalyzerPtr analyzer
Definition: IndexWriter.h:118
HashSet< String > synced
Definition: IndexWriter.h:192
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
int64_t writeThread
Definition: IndexWriter.h:165
Definition: AbstractAllTermDocs.h:12
Lucene exception container.
Definition: LuceneException.h:15
boost::shared_ptr< IndexCommit > IndexCommitPtr
Definition: LuceneTypes.h:152
static int32_t MESSAGE_ID
Definition: IndexWriter.h:113
bool poolReaders
Definition: IndexWriter.h:175
boost::shared_ptr< SegmentReader > SegmentReaderPtr
Definition: LuceneTypes.h:215
static const int32_t DEFAULT_TERM_INDEX_INTERVAL
Default value is 128. Change using setTermIndexInterval(int32_t).
Definition: IndexWriter.h:232
SegmentInfosPtr rollbackSegmentInfos
Definition: IndexWriter.h:130
InfoStreamPtr infoStream
Definition: IndexWriter.h:189
SegmentInfosPtr segmentInfos
Definition: IndexWriter.h:136
int32_t flushDeletesCount
Definition: IndexWriter.h:161
DirectoryPtr directory
Definition: IndexWriter.h:117
Collection< OneMergePtr > pendingMerges
Definition: IndexWriter.h:154
boost::shared_ptr< ReaderPool > ReaderPoolPtr
Definition: LuceneTypes.h:203
bool create
Definition: IndexWriter.h:120
IndexingChainPtr indexingChain
Definition: IndexWriter.h:122
static const int32_t DEFAULT_MAX_BUFFERED_DELETE_TERMS
Disabled by default (because IndexWriter flushes by RAM usage by default). Change using setMaxBufferedDeleteTerms(int32_t).
Definition: IndexWriter.h:226
boost::shared_ptr< IndexingChain > IndexingChainPtr
Definition: LuceneTypes.h:156
boost::shared_ptr< Synchronize > SynchronizePtr
Definition: LuceneTypes.h:552
boost::shared_ptr< Similarity > SimilarityPtr
Definition: LuceneTypes.h:435
SimilarityPtr similarity
Definition: IndexWriter.h:125
MergePolicyPtr mergePolicy
Definition: IndexWriter.h:152
static const int32_t DEFAULT_MAX_FIELD_LENGTH
Default value is 10,000. Change using setMaxFieldLength(int32_t).
Definition: IndexWriter.h:229
int32_t readCount
Used to only allow one addIndexes to proceed at once.
Definition: IndexWriter.h:164
boost::shared_ptr< SegmentMerger > SegmentMergerPtr
Definition: LuceneTypes.h:214
int32_t readerTermsIndexDivisor
Definition: IndexWriter.h:168
int32_t upgradeCount
Definition: IndexWriter.h:166
IndexDeletionPolicyPtr deletionPolicy
Definition: IndexWriter.h:121
static InfoStreamPtr defaultInfoStream
Definition: IndexWriter.h:190
MergeSchedulerPtr mergeScheduler
Definition: IndexWriter.h:153
SegmentInfosPtr localRollbackSegmentInfos
Definition: IndexWriter.h:133

clucene.sourceforge.net