00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
// Forward declaration only: within this header CXMLElement is used solely as a
// const-reference parameter of the CAcIFFileSystem constructor, so the full
// definition is not required here.
class CXMLElement;
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
#ifndef _CACIFFILESYSTEM
00051
#define _CACIFFILESYSTEM
00052
#include "libGIFTAcInvertedFile/include/uses-declarations.h"
00053
#include <string>
00054
#include "libMRML/include/TID.h"
00055
#include "libMRML/include/CSelfDestroyPointer.h"
00056
#include "libMRML/include/CArraySelfDestroyPointer.h"
00057
#include "libGIFTAcInvertedFile/include/CDocumentFrequencyList.h"
00058
#include "libMRML/include/CMutex.h"
00059
00060
#include "libGIFTAcInvertedFile/include/CADIHash.h"
00061
#include "libGIFTAcURL2FTS/include/CAcURL2FTS.h"
00062
#include "libGIFTAcInvertedFile/include/CAcInvertedFile.h"
00063
#include <iostream>
00064
#include <fstream>
00065
#include <map>
00066
#include <vector>
00067
#ifdef HAS_HASH_MAP
00068
#include <hash_map>
00069
#define HASH_MAP hash_map
00070
#else
00071
#define HASH_MAP map
00072
#endif
00073
#include <functional>
00074
#include <algorithm>
00075
00076
#include "libMRML/include/CMagic.h"
00077
00078
00079
// Alias of TID used by the feature-oriented members of CAcIFFileSystem
// (FeatureToList, FeatureToCollectionFrequency) to mark IDs that denote
// features. It is the same type as TID, so the two are freely interchangeable.
typedef TID TFeatureID ;
00080
00091 class CAcIFFileSystem:
public CAcInvertedFile{
00092
00093
protected:
00095 CMutex mMutex;
00101 CSelfDestroyPointer<CAcURL2FTS> mURL2FTS;
00103 TID
mMaximumFeatureID;
00106
#ifndef V295
00107 string
mInvertedFileBuffer;
00108
#else
00109
CArraySelfDestroyPointer<char>
mInvertedFileBuffer;
00110
#endif
00111
00113 string
mTemporaryIndexingFileBase;
00115 mutable CSelfDestroyPointer<istream> mInvertedFile;
00116
00118 mutable ifstream
mOffsetFile;
00119
00121 ifstream
mFeatureDescriptionFile;
00122
00124 string
mInvertedFileName;
00125
00127 string
mOffsetFileName;
00128
00130 string
mFeatureDescriptionFileName;
00131
00133 typedef HASH_MAP<TID,streampos>
CIDToOffset;
00135 CIDToOffset mIDToOffset;
00136
00138 mutable HASH_MAP<TID,double>
mFeatureToCollectionFrequency;
00139
00143 HASH_MAP<TID,unsigned int>
mFeatureDescription;
00144
00148 CADIHash mDocumentInformation;
00150
00153
void writeOffsetFileElement(TID inFeatureID,
00154 streampos inPosition,
00155 ostream& inOpenOffsetFile);
00157
CDocumentFrequencyList*
getFeatureFile(string inFileName)
const;
00158
public:
00160
bool operator()()const;
00161
00188
CAcIFFileSystem(const
CXMLElement& inCollectionElement);
00190
bool init(
bool);
00191
00193 ~
CAcIFFileSystem();
00194
00196 string IDToURL(TID inID)const;
00197
00201
CDocumentFrequencyList* FeatureToList(TFeatureID)const;
00202
00204
CDocumentFrequencyList* URLToFeatureList(string inURL)const;
00205
00207
CDocumentFrequencyList* DIDToFeatureList(TID inDID)const;
00208
00210
00211
00215
double FeatureToCollectionFrequency(TFeatureID)const;
00216
00218
unsigned int getFeatureDescription(TID inFeatureID)const;
00220
00224
double DIDToMaxDocumentFrequency(TID)const;
00225
00227
double DIDToDFSquareSum(TID)const;
00228
00230
double DIDToSquareDFLogICFSum(TID)const;
00232
00233
00235
00243
bool generateInvertedFile();
00244
00252
bool newGenerateInvertedFile();
00253
00256
bool checkConsistency();
00257
00264
bool findWithinStream(TID inFeatureID,
00265 TID inDocumentID,
00266
double inDocumentFrequency)const;
00267
00269
00275 virtual pair<
bool,TID> URLToID(const string& inURL)const;
00276
00278
void getAllIDs(list<TID>&)const;
00281
void getAllAccessorElements(list<
CAccessorElement>&)const;
00286
void getRandomIDs(list<TID>&,
00287 list<TID>::size_type)const;
00296
void getRandomAccessorElements(list<
CAccessorElement>& outResult,
00297 list<
CAccessorElement>::size_type inSize)const;
00299
int size()const;
00301
00302 TID getMaximumFeatureID()const;
00310 list<TID>* getAllFeatureIDs()const;
00316 virtual pair<
bool,
CAccessorElement> IDToAccessorElement(TID inID)const;
00318 operator
bool()const;
00319
00320 };
00321
00322 #endif