0.08.01
C++ Open Travel Request Parsing Library
Toggle main menu visibility
Loading...
Searching...
No Matches
Filter.cpp
Go to the documentation of this file.
1
// //////////////////////////////////////////////////////////////////////
2
// Import section
3
// //////////////////////////////////////////////////////////////////////
4
// STL
5
#include <cassert>
6
#include <sstream>
7
// OpenTrep
8
#include <
opentrep/basic/BasConst_General.hpp
>
9
#include <
opentrep/basic/Utilities.hpp
>
10
#include <
opentrep/bom/Filter.hpp
>
11
#include <
opentrep/service/Logger.hpp
>
12
13
namespace
OPENTREP
{
14
15
// //////////////////////////////////////////////////////////////////////
16
/**
 * Default constructor.
 *
 * Its body only asserts false: reaching it trips the assertion whenever
 * assertions are enabled.
 */
Filter::Filter() {
  assert (false);
}
19
20
// //////////////////////////////////////////////////////////////////////
21
/**
 * Copy constructor.
 *
 * Its body only asserts false and never reads the given object: reaching
 * it trips the assertion whenever assertions are enabled.
 *
 * @param iFilter Object to copy from (not used).
 */
Filter::Filter (const Filter& iFilter) {
  assert (false);
}
24
25
// //////////////////////////////////////////////////////////////////////
26
/**
 * Destructor. Nothing to release here.
 */
Filter::~Filter() {
}
28
29
38
// //////////////////////////////////////////////////////////////////////
39
/**
 * Tell whether a word is long enough.
 *
 * @param iWord Word to be checked.
 * @param iMinWordLength Minimal number of letters required.
 * @return true when the size of the word is greater than or equal to
 *         iMinWordLength; false when it is strictly smaller.
 */
bool hasGoodSize (const std::string& iWord,
                  const NbOfLetters_T& iMinWordLength) {
  const size_t lNbOfLetters = iWord.size();
  return !(lNbOfLetters < iMinWordLength);
}
48
52
// //////////////////////////////////////////////////////////////////////
53
bool
isBlackListed
(
const
std::string& iWord) {
54
// When the word is part of the "black list", it should obviously be
55
// filtered out.
56
BlackList_T::const_iterator itWord =
K_BLACK_LIST
.find (iWord);
57
const
bool
isBlackListedFlag = (itWord !=
K_BLACK_LIST
.end());
58
59
// DEBUG
60
// const std::string areEqualStr = (isBlackListedFlag)?"Yes":"No";
61
// const std::string& lWord = *itWord;
62
// OPENTREP_LOG_DEBUG ("Word: '" << iWord << "', black-list word: '"
63
// << lWord << "', Equals: " << areEqualStr);
64
65
return
isBlackListedFlag;
66
}
67
71
// //////////////////////////////////////////////////////////////////////
72
/**
 * Remove the non-relevant words from the right-hand side of the list.
 *
 * The last word is dropped for as long as it is either too short
 * (fewer than iMinWordLength letters) or black-listed. The process
 * stops at the first word passing both checks, or when the list
 * becomes empty.
 *
 * @param ioWordList List of words, altered in place.
 * @param iMinWordLength Minimal number of letters for a word to be kept.
 */
void rtrim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
  // An empty list has nothing left to trim
  while (ioWordList.empty() == false) {
    // Right outer word
    const std::string& lLastWord = ioWordList.back();

    // A word of good size, which is not black-listed, ends the trimming
    if (hasGoodSize (lLastWord, iMinWordLength) == true
        && isBlackListed (lLastWord) == false) {
      break;
    }

    // Otherwise, drop that word and examine the new right outer word
    ioWordList.pop_back();
  }
}
92
96
// //////////////////////////////////////////////////////////////////////
97
/**
 * Remove the non-relevant words from the left-hand side of the list.
 *
 * The first word is dropped for as long as it is either too short
 * (fewer than iMinWordLength letters) or black-listed. The process
 * stops at the first word passing both checks, or when the list
 * becomes empty.
 *
 * @param ioWordList List of words, altered in place.
 * @param iMinWordLength Minimal number of letters for a word to be kept.
 */
void ltrim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
  // An empty list has nothing left to trim
  while (ioWordList.empty() == false) {
    // Left outer word
    const std::string& lFirstWord = ioWordList.front();

    // A word of good size, which is not black-listed, ends the trimming
    if (hasGoodSize (lFirstWord, iMinWordLength) == true
        && isBlackListed (lFirstWord) == false) {
      break;
    }

    // Otherwise, drop that word and examine the new left outer word
    ioWordList.pop_front();
  }
}
117
121
// //////////////////////////////////////////////////////////////////////
122
/**
 * Remove the non-relevant outer words on both sides of the list.
 *
 * The left-hand side is processed first (ltrim), then the right-hand
 * side (rtrim). Words located between two kept words are left alone.
 *
 * @param ioWordList List of words, altered in place.
 * @param iMinWordLength Minimal number of letters for an outer word to
 *        be kept.
 */
void trim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
  // Left outer words first...
  ltrim (ioWordList, iMinWordLength);

  // ...then the right outer words
  rtrim (ioWordList, iMinWordLength);
}
129
130
// //////////////////////////////////////////////////////////////////////
131
void
Filter::trim
(std::string& ioPhrase,
const
NbOfLetters_T
& iMinWordLength) {
132
// Create a list of words from the given phrase
133
WordList_T
lWordList;
134
tokeniseStringIntoWordList
(ioPhrase, lWordList);
135
136
// Trim the non-relevant left and right outer words
137
OPENTREP::trim
(lWordList, iMinWordLength);
138
139
// Re-create the phrase from the (potentially altered) list of words
140
ioPhrase =
createStringFromWordList
(lWordList);
141
}
142
143
// //////////////////////////////////////////////////////////////////////
144
bool
Filter::shouldKeep
(
const
std::string& iPhrase,
145
const
std::string& iWord) {
146
bool
isToBeKept =
true
;
147
148
// If both the phrase and the word are empty, the word should obviously
149
// be filtered out.
150
if
(iPhrase.empty() ==
true
&& iWord.empty() ==
true
) {
151
isToBeKept =
false
;
152
return
isToBeKept;
153
}
154
155
// If the term to be added is equal to the whole phrase (e.g., 'san'),
156
// it should be kept (not filtered out). Indeed, three-letter words
157
// often correspond to IATA codes, and should obviously be kept for
158
// indexation/searching.
159
if
(iPhrase == iWord) {
160
return
isToBeKept;
161
}
162
163
// Now, the word is part of the phrase, and not equal to it (and not empty).
164
165
// If the word has no more than two letters (e.g., 'de'), it should be
166
// filtered out. Indeed, when 'de' is part of 'charles de gaulle',
167
// for instance, it should not be indexed/searched alone (in a search,
168
// the resulting match score will be zero).
169
isToBeKept =
hasGoodSize
(iWord, 3);
170
if
(isToBeKept ==
false
) {
171
return
isToBeKept;
172
}
173
174
// Check whether the word is black-listed
175
isToBeKept = !
isBlackListed
(iWord);
176
177
//
178
return
isToBeKept;
179
}
180
181
}
BasConst_General.hpp
Filter.hpp
Logger.hpp
Utilities.hpp
OPENTREP
Definition
BasChronometer.cpp:10
OPENTREP::WordList_T
std::list< Word_T > WordList_T
Definition
OPENTREP_Types.hpp:690
OPENTREP::NbOfLetters_T
unsigned int NbOfLetters_T
Definition
OPENTREP_Types.hpp:705
OPENTREP::tokeniseStringIntoWordList
void tokeniseStringIntoWordList(const std::string &iPhrase, WordList_T &ioWordList)
Definition
Utilities.cpp:19
OPENTREP::createStringFromWordList
std::string createStringFromWordList(const WordList_T &iWordList, const NbOfWords_T iSplitIdx, const bool iFromBeginningFlag)
Definition
Utilities.cpp:43
OPENTREP::K_BLACK_LIST
const BlackList_T K_BLACK_LIST
Definition
BasConst.cpp:211
OPENTREP::rtrim
void rtrim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition
Filter.cpp:72
OPENTREP::hasGoodSize
bool hasGoodSize(const std::string &iWord, const NbOfLetters_T &iMinWordLength)
Definition
Filter.cpp:39
OPENTREP::isBlackListed
bool isBlackListed(const std::string &iWord)
Definition
Filter.cpp:53
OPENTREP::trim
void trim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition
Filter.cpp:122
OPENTREP::ltrim
void ltrim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition
Filter.cpp:97
OPENTREP::Filter
Class filtering out the words not suitable for indexing and/or searching, when part of greater string...
Definition
Filter.hpp:21
OPENTREP::Filter::trim
static void trim(std::string &ioPhrase, const NbOfLetters_T &iMinWordLength=4)
Definition
Filter.cpp:131
OPENTREP::Filter::shouldKeep
static bool shouldKeep(const std::string &iPhrase, const std::string &iWord)
Definition
Filter.cpp:144
Generated on
for OpenTREP by
1.17.0