QGIS API Documentation  2.14.11-Essen
qgsstringutils.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  qgsstringutils.cpp
3  ------------------
4  begin : June 2015
5  copyright : (C) 2015 by Nyall Dawson
6  email : nyall dot dawson at gmail dot com
7  ***************************************************************************
8  * *
9  * This program is free software; you can redistribute it and/or modify *
10  * it under the terms of the GNU General Public License as published by *
11  * the Free Software Foundation; either version 2 of the License, or *
12  * (at your option) any later version. *
13  * *
14  ***************************************************************************/
15 
16 #include "qgsstringutils.h"
17 #include <QVector>
18 #include <QRegExp>
19 #include <QStringList>
20 #include <QTextBoundaryFinder>
21 
23 {
24  if ( string.isEmpty() )
25  return QString();
26 
27  switch ( capitalization )
28  {
29  case MixedCase:
30  return string;
31 
32  case AllUppercase:
33  return string.toUpper();
34 
35  case AllLowercase:
36  return string.toLower();
37 
39  {
40  QString temp = string;
41 
42  QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), 0, 0 );
43  QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), 0, 0 );
44 
45  wordSplitter.setPosition( 0 );
46  bool first = true;
47 #if QT_VERSION >= 0x050000
48  while (( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
49  || wordSplitter.toNextBoundary() >= 0 )
50 #else
51  while (( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartWord )
52  || wordSplitter.toNextBoundary() >= 0 )
53 #endif
54  {
55  first = false;
56  letterSplitter.setPosition( wordSplitter.position() );
57  letterSplitter.toNextBoundary();
58  QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
59  temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
60  }
61  return temp;
62  }
63 
64  }
65  // no warnings
66  return string;
67 }
68 
69 int QgsStringUtils::levenshteinDistance( const QString& string1, const QString& string2, bool caseSensitive )
70 {
71  int length1 = string1.length();
72  int length2 = string2.length();
73 
74  //empty strings? solution is trivial...
75  if ( string1.isEmpty() )
76  {
77  return length2;
78  }
79  else if ( string2.isEmpty() )
80  {
81  return length1;
82  }
83 
84  //handle case sensitive flag (or not)
85  QString s1( caseSensitive ? string1 : string1.toLower() );
86  QString s2( caseSensitive ? string2 : string2.toLower() );
87 
88  const QChar* s1Char = s1.constData();
89  const QChar* s2Char = s2.constData();
90 
91  //strip out any common prefix
92  int commonPrefixLen = 0;
93  while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
94  {
95  commonPrefixLen++;
96  length1--;
97  length2--;
98  s1Char++;
99  s2Char++;
100  }
101 
102  //strip out any common suffix
103  while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
104  {
105  length1--;
106  length2--;
107  }
108 
109  //fully checked either string? if so, the answer is easy...
110  if ( length1 == 0 )
111  {
112  return length2;
113  }
114  else if ( length2 == 0 )
115  {
116  return length1;
117  }
118 
119  //ensure the inner loop is longer
120  if ( length1 > length2 )
121  {
122  qSwap( s1, s2 );
123  qSwap( length1, length2 );
124  }
125 
126  //levenshtein algorithm begins here
127  QVector< int > col;
128  col.fill( 0, length2 + 1 );
129  QVector< int > prevCol;
130  prevCol.reserve( length2 + 1 );
131  for ( int i = 0; i < length2 + 1; ++i )
132  {
133  prevCol << i;
134  }
135  const QChar* s2start = s2Char;
136  for ( int i = 0; i < length1; ++i )
137  {
138  col[0] = i + 1;
139  s2Char = s2start;
140  for ( int j = 0; j < length2; ++j )
141  {
142  col[j + 1] = qMin( qMin( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + (( *s1Char == *s2Char ) ? 0 : 1 ) );
143  s2Char++;
144  }
145  col.swap( prevCol );
146  s1Char++;
147  }
148  return prevCol[length2];
149 }
150 
151 QString QgsStringUtils::longestCommonSubstring( const QString& string1, const QString& string2, bool caseSensitive )
152 {
153  if ( string1.isEmpty() || string2.isEmpty() )
154  {
155  //empty strings, solution is trivial...
156  return QString();
157  }
158 
159  //handle case sensitive flag (or not)
160  QString s1( caseSensitive ? string1 : string1.toLower() );
161  QString s2( caseSensitive ? string2 : string2.toLower() );
162 
163  if ( s1 == s2 )
164  {
165  //another trivial case, identical strings
166  return s1;
167  }
168 
169  int* currentScores = new int [ s2.length()];
170  int* previousScores = new int [ s2.length()];
171  int maxCommonLength = 0;
172  int lastMaxBeginIndex = 0;
173 
174  const QChar* s1Char = s1.constData();
175  const QChar* s2Char = s2.constData();
176  const QChar* s2Start = s2Char;
177 
178  for ( int i = 0; i < s1.length(); ++i )
179  {
180  for ( int j = 0; j < s2.length(); ++j )
181  {
182  if ( *s1Char != *s2Char )
183  {
184  currentScores[j] = 0;
185  }
186  else
187  {
188  if ( i == 0 || j == 0 )
189  {
190  currentScores[j] = 1;
191  }
192  else
193  {
194  currentScores[j] = 1 + previousScores[j - 1];
195  }
196 
197  if ( maxCommonLength < currentScores[j] )
198  {
199  maxCommonLength = currentScores[j];
200  lastMaxBeginIndex = i;
201  }
202  }
203  s2Char++;
204  }
205  qSwap( currentScores, previousScores );
206  s1Char++;
207  s2Char = s2Start;
208  }
209  delete [] currentScores;
210  delete [] previousScores;
211  return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
212 }
213 
214 int QgsStringUtils::hammingDistance( const QString& string1, const QString& string2, bool caseSensitive )
215 {
216  if ( string1.isEmpty() && string2.isEmpty() )
217  {
218  //empty strings, solution is trivial...
219  return 0;
220  }
221 
222  if ( string1.length() != string2.length() )
223  {
224  //invalid inputs
225  return -1;
226  }
227 
228  //handle case sensitive flag (or not)
229  QString s1( caseSensitive ? string1 : string1.toLower() );
230  QString s2( caseSensitive ? string2 : string2.toLower() );
231 
232  if ( s1 == s2 )
233  {
234  //another trivial case, identical strings
235  return 0;
236  }
237 
238  int distance = 0;
239  const QChar* s1Char = s1.constData();
240  const QChar* s2Char = s2.constData();
241 
242  for ( int i = 0; i < string1.length(); ++i )
243  {
244  if ( *s1Char != *s2Char )
245  distance++;
246  s1Char++;
247  s2Char++;
248  }
249 
250  return distance;
251 }
252 
254 {
255  if ( string.isEmpty() )
256  return QString();
257 
258  QString tmp = string.toUpper();
259 
260  //strip non character codes, and vowel like characters after the first character
261  QChar* char1 = tmp.data();
262  QChar* char2 = tmp.data();
263  int outLen = 0;
264  for ( int i = 0; i < tmp.length(); ++i, ++char2 )
265  {
266  if (( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || (( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
267  && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
268  && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
269  && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
270  {
271  *char1 = *char2;
272  char1++;
273  outLen++;
274  }
275  }
276  tmp.truncate( outLen );
277 
278  QChar* tmpChar = tmp.data();
279  tmpChar++;
280  for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
281  {
282  switch (( *tmpChar ).unicode() )
283  {
284  case 0x42:
285  case 0x46:
286  case 0x50:
287  case 0x56:
288  tmp.replace( i, 1, QChar( 0x31 ) );
289  break;
290 
291  case 0x43:
292  case 0x47:
293  case 0x4A:
294  case 0x4B:
295  case 0x51:
296  case 0x53:
297  case 0x58:
298  case 0x5A:
299  tmp.replace( i, 1, QChar( 0x32 ) );
300  break;
301 
302  case 0x44:
303  case 0x54:
304  tmp.replace( i, 1, QChar( 0x33 ) );
305  break;
306 
307  case 0x4C:
308  tmp.replace( i, 1, QChar( 0x34 ) );
309  break;
310 
311  case 0x4D:
312  case 0x4E:
313  tmp.replace( i, 1, QChar( 0x35 ) );
314  break;
315 
316  case 0x52:
317  tmp.replace( i, 1, QChar( 0x36 ) );
318  break;
319  }
320  }
321 
322  //remove adjacent duplicates
323  char1 = tmp.data();
324  char2 = tmp.data();
325  char2++;
326  outLen = 1;
327  for ( int i = 1; i < tmp.length(); ++i, ++char2 )
328  {
329  if ( *char2 != *char1 )
330  {
331  char1++;
332  *char1 = *char2;
333  outLen++;
334  if ( outLen == 4 )
335  break;
336  }
337  }
338  tmp.truncate( outLen );
339  if ( tmp.length() < 4 )
340  {
341  tmp.append( "000" );
342  tmp.truncate( 4 );
343  }
344 
345  return tmp;
346 }
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
QString & append(QChar ch)
QString toUpper() const
void truncate(int position)
QVector< T > & fill(const T &value, int size)
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
static QString capitalize(const QString &string, Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
bool isEmpty() const
void setPosition(int position)
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
Convert just the first letter of each word to uppercase, leave the rest untouched.
Convert all characters to uppercase.
QString toLower() const
Capitalization
Capitalization options.
void reserve(int size)
QString & replace(int position, int n, QChar after)
QString mid(int position, int n) const
Mixed case, ie no change.
int length() const
int position() const
void swap(QVector< T > &other)
Convert all characters to lowercase.
BoundaryReasons boundaryReasons() const
QChar * data()
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.