00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
// Forward declaration. In the declarations visible in this header,
// CXMLElement is only used as a const reference parameter of the
// CAcInvertedFile constructor, so the full definition is not needed here.
class CXMLElement;
00027
00047
#ifndef _CINVERTEDFILEACCESSOR
00048
#define _CINVERTEDFILEACCESSOR
00049
#include "libGIFTAcInvertedFile/include/uses-declarations.h"
00050
#include <string>
00051
#include "libMRML/include/TID.h"
00052
#include "libMRML/include/CSelfDestroyPointer.h"
00053
#include "libMRML/include/CArraySelfDestroyPointer.h"
00054
#include "libGIFTAcInvertedFile/include/CDocumentFrequencyList.h"
00055
#include "CCollectionFrequencyList.h"
00056
#include "libGIFTAcInvertedFile/include/CADIHash.h"
00057
#include "libGIFTAcURL2FTS/include/CAcURL2FTS.h"
00058
#include <iostream>
00059
#include <fstream>
00060
#include <map>
00061
#include <vector>
00062
#ifdef HAS_HASH_MAP
00063
#include <hash_map>
00064
#else
00065
#define hash_map map
00066
#endif
00067
#include <functional>
00068
#include <algorithm>
00069
00070
#include "libMRML/include/CMagic.h"
00071
00072
00073
// Alias of TID used where an identifier denotes a feature, e.g. in the
// parameters of CAcInvertedFile::FeatureToList() and
// CAcInvertedFile::FeatureToCollectionFrequency().
typedef TID TFeatureID ;
00074
00081
class CAcInvertedFile:
public CAcURL2FTS{
00082
00083
protected:
00085 TID
mMaximumFeatureID;
00088 CArraySelfDestroyPointer<char>
mInvertedFileBuffer;
00090 mutable CSelfDestroyPointer<istream> mInvertedFile;
00091
00093 mutable ifstream
mOffsetFile;
00094
00096 ifstream
mFeatureDescriptionFile;
00097
00099 string
mInvertedFileName;
00100
00102 string
mOffsetFileName;
00103
00105 string
mFeatureDescriptionFileName;
00106
00108 typedef hash_map<TID,unsigned int>
CIDToOffset;
00110 CIDToOffset mIDToOffset;
00111
00113 mutable hash_map<TID,double>
mFeatureToCollectionFrequency;
00114
00118 hash_map<TID,unsigned int>
mFeatureDescription;
00119
00123 CADIHash mDocumentInformation;
00125
00128
void writeOffsetFileElement(TID inFeatureID,
00129
int inPosition,
00130 ostream& inOpenOffsetFile);
00132
CDocumentFrequencyList*
getFeatureFile(string inFileName)
const;
00133
public:
00135
bool operator()()const;
00136
00151
CAcInvertedFile(const
CXMLElement& inCollectionElement);
00153
bool init(
bool);
00154
00156 ~
CAcInvertedFile();
00157
00159 string IDToURL(TID inID)const;
00160
00162 TID URLToID(const string& inURL)const;
00163
00167
CDocumentFrequencyList* FeatureToList(TFeatureID)const;
00168
00170
CDocumentFrequencyList* URLToFeatureList(string inURL)const;
00171
00173
CDocumentFrequencyList* DIDToFeatureList(TID inDID)const;
00174
00176
00177
00181
double FeatureToCollectionFrequency(TFeatureID)const;
00182
00184
unsigned int getFeatureDescription(TID inFeatureID)const;
00186
00190
double DIDToMaxDocumentFrequency(TID)const;
00191
00193
double DIDToDFSquareSum(TID)const;
00194
00196
double DIDToSquareDFLogICFSum(TID)const;
00198
00199
00201
00209
bool generateInvertedFile();
00210
00218
bool newGenerateInvertedFile();
00219
00222
bool checkConsistency();
00223
00227
bool findWithinStream(TID inFeatureID,
00228 TID inDocumentID,
00229
double inDocumentFrequency)const;
00230
00232
00234 TID getMaximumFeatureID()const;
00242 list<TID>* getAllFeatureIDs()const;
00243 };
00244
00245 #endif