Commit bca6baee authored by Dimitri van Heesch

Bug 705910 - Indexing and searching cannot treat non ASCII identifiers

parent f6bc941e
......@@ -174,7 +174,7 @@ QHP_SECT_FILTER_ATTRS =
QHG_LOCATION =
GENERATE_ECLIPSEHELP = YES
ECLIPSE_DOC_ID = org.doxygen.qtools
DISABLE_INDEX = YES
DISABLE_INDEX = NO
GENERATE_TREEVIEW = YES
ENUM_VALUES_PER_LINE = 4
TREEVIEW_WIDTH = 250
......
This diff is collapsed.
......@@ -5,7 +5,7 @@ function convertToId(search)
{
var c = search.charAt(i);
var cn = c.charCodeAt(0);
if (c.match(/[a-z0-9]/))
if (c.match(/[a-z0-9\u0080-\uFFFF]/))
{
result+=c;
}
......@@ -310,22 +310,20 @@ function SearchBox(name, resultsPath, inFrame, label)
var searchValue = this.DOMSearchField().value.replace(/^ +/, "");
var code = searchValue.toLowerCase().charCodeAt(0);
var hexCode;
if (code<16)
var idxChar = searchValue.substr(0, 1).toLowerCase();
if ( 0xD800 <= code && code <= 0xDBFF && searchValue > 1) // surrogate pair
{
hexCode="0"+code.toString(16);
}
else
{
hexCode=code.toString(16);
idxChar = searchValue.substr(0, 2);
}
var resultsPage;
var resultsPageWithSearch;
var hasResultsPage;
if (indexSectionsWithContent[this.searchIndex].charAt(code) == '1')
var idx = indexSectionsWithContent[this.searchIndex].indexOf(idxChar);
if (idx!=-1)
{
var hexCode=idx.toString(16);
resultsPage = this.resultsPath + '/' + indexSectionNames[this.searchIndex] + '_' + hexCode + '.html';
resultsPageWithSearch = resultsPage+'?'+escape(searchValue);
hasResultsPage = true;
......
......@@ -358,7 +358,7 @@ function main()
$sorted = run_query($query);
// Now output the HTML stuff...
// End the HTML form
end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
end_form(preg_replace("/[^a-zA-Z0-9\-\_\.\x80-\xFF]/i", " ", $query ));
// report results to the user
report_results($sorted);
end_page();
......
......@@ -358,7 +358,7 @@
" $sorted = run_query($query);\n"
" // Now output the HTML stuff...\n"
" // End the HTML form\n"
" end_form(preg_replace(\"/[^a-zA-Z0-9\\-\\_\\.]/i\", \" \", $query ));\n"
" end_form(preg_replace(\"/[^a-zA-Z0-9\\-\\_\\.\\x80-\\xFF]/i\", \" \", $query ));\n"
" // report results to the user\n"
" report_results($sorted);\n"
" end_page();\n"
......
......@@ -5,7 +5,7 @@
" {\n"
" var c = search.charAt(i);\n"
" var cn = c.charCodeAt(0);\n"
" if (c.match(/[a-z0-9]/))\n"
" if (c.match(/[a-z0-9\\u0080-\\uFFFF]/))\n"
" {\n"
" result+=c;\n"
" }\n"
......@@ -310,22 +310,20 @@
" var searchValue = this.DOMSearchField().value.replace(/^ +/, \"\");\n"
"\n"
" var code = searchValue.toLowerCase().charCodeAt(0);\n"
" var hexCode;\n"
" if (code<16) \n"
" var idxChar = searchValue.substr(0, 1).toLowerCase();\n"
" if ( 0xD800 <= code && code <= 0xDBFF && searchValue > 1) // surrogate pair\n"
" {\n"
" hexCode=\"0\"+code.toString(16);\n"
" }\n"
" else \n"
" {\n"
" hexCode=code.toString(16);\n"
" idxChar = searchValue.substr(0, 2);\n"
" }\n"
"\n"
" var resultsPage;\n"
" var resultsPageWithSearch;\n"
" var hasResultsPage;\n"
"\n"
" if (indexSectionsWithContent[this.searchIndex].charAt(code) == '1')\n"
" var idx = indexSectionsWithContent[this.searchIndex].indexOf(idxChar);\n"
" if (idx!=-1)\n"
" {\n"
" var hexCode=idx.toString(16);\n"
" resultsPage = this.resultsPath + '/' + indexSectionNames[this.searchIndex] + '_' + hexCode + '.html';\n"
" resultsPageWithSearch = resultsPage+'?'+escape(searchValue);\n"
" hasResultsPage = true;\n"
......
This diff is collapsed.
......@@ -108,7 +108,7 @@ class SDict
* \param caseSensitive indicated whether the keys should be sorted
* in a case sensitive way.
*/
SDict(int size,bool caseSensitive=TRUE) : m_sizeIndex(0)
SDict(int size=17,bool caseSensitive=TRUE) : m_sizeIndex(0)
{
m_list = new SList<T>(this);
#if AUTORESIZE
......@@ -454,7 +454,7 @@ class SIntDict
* \param size The size of the dictionary. Should be a prime number for
* best distribution of elements.
*/
SIntDict(int size) : m_sizeIndex(0)
SIntDict(int size=17) : m_sizeIndex(0)
{
m_list = new SIntList<T>(this);
#if AUTORESIZE
......@@ -636,7 +636,7 @@ class SIntDict
{
return m_li->current();
}
/*! Moves the iterator to the next element.
* \return the new "current" element, or zero if the iterator was
* already pointing at the last element.
......@@ -659,6 +659,76 @@ class SIntDict
QListIterator<T> *m_li;
};
class IteratorDict; // first forward declare
friend class IteratorDict; // then make it a friend
/*! Simple iterator for SIntDict. It iterates over the dictionary elements
 *  in an unsorted way, but does provide information about the element's
 *  integer key.
 */
class IteratorDict
{
  public:
    /*! Create an iterator given the dictionary. */
    IteratorDict(const SIntDict<T> &dict)
    {
      m_di = new QIntDictIterator<T>(*dict.m_dict);
    }

    /*! Destroys the iterator (the dictionary itself is left untouched). */
    virtual ~IteratorDict()
    {
      delete m_di;
    }

    /*! Set the iterator to the first element in the dictionary.
     *  \return The first element, or zero if the dictionary was empty.
     */
    T *toFirst() const
    {
      return m_di->toFirst();
    }

    /*! Set the iterator to the last element in the dictionary.
     *  \return The last element, or zero if the dictionary was empty.
     */
    T *toLast() const
    {
      return m_di->toLast();
    }

    /*! Returns the current element */
    T *current() const
    {
      return m_di->current();
    }

    /*! Returns the integer key of the current element */
    int currentKey() const
    {
      return m_di->currentKey();
    }

    /*! Moves the iterator to the next element.
     *  \return the new "current" element, or zero if the iterator was
     *          already pointing at the last element.
     */
    T *operator++()
    {
      return m_di->operator++();
    }

    /*! Moves the iterator to the previous element.
     *  \return the new "current" element, or zero if the iterator was
     *          already pointing at the first element.
     */
    T *operator--()
    {
      return m_di->operator--();
    }

  private:
    // Must be the integer-keyed iterator: it is created as a
    // QIntDictIterator<T> in the constructor and currentKey() returns int.
    QIntDictIterator<T> *m_di;
};
};
#endif
......@@ -7919,3 +7919,72 @@ void addDocCrossReference(MemberDef *src,MemberDef *dst)
}
}
//--------------------------------------------------------------------------------------
/*! @brief Get one unicode character as an unsigned integer from utf-8 string
 *
 * The lead byte at position \a idx selects how many bytes are decoded:
 * - lead byte < 0xC2 or >= 0xF8: the byte itself is returned unchanged
 *   (this covers plain ASCII, but also stray continuation bytes and
 *   lead bytes that are not valid in UTF-8);
 * - 0xC2..0xDF: 2 byte sequence;
 * - 0xE0..0xEF: 3 byte sequence;
 * - 0xF0..0xF7: 4 byte sequence.
 *
 * The continuation bytes are only masked with 0x3f; they are not checked
 * for the 10xxxxxx bit pattern.
 *
 * @param s utf-8 encoded string
 * @param idx byte position of given string \a s.
 * @return the decoded unicode codepoint, or 0 if \a idx lies outside the
 *         string or the multi-byte sequence is cut off by the end of
 *         the string.
 * @see getUtf8CodeToLower()
 * @see getUtf8CodeToUpper()
 */
uint getUtf8Code( const QCString& s, int idx )
{
  const int length = s.length();
  // Reject positions before the start as well as past the end, so that
  // s.at() is never called with an out-of-range index.
  if (idx < 0 || idx >= length) { return 0; }
  const uint c0 = (uchar)s.at(idx);
  if ( c0 < 0xC2 || c0 >= 0xF8 ) // 1 byte character
  {
    return c0;
  }
  if (idx+1 >= length) { return 0; } // truncated sequence
  const uint c1 = ((uchar)s.at(idx+1)) & 0x3f;
  if ( c0 < 0xE0 ) // 2 byte character
  {
    return ((c0 & 0x1f) << 6) | c1;
  }
  if (idx+2 >= length) { return 0; } // truncated sequence
  const uint c2 = ((uchar)s.at(idx+2)) & 0x3f;
  if ( c0 < 0xF0 ) // 3 byte character
  {
    return ((c0 & 0x0f) << 12) | (c1 << 6) | c2;
  }
  if (idx+3 >= length) { return 0; } // truncated sequence
  // 4 byte character
  const uint c3 = ((uchar)s.at(idx+3)) & 0x3f;
  return ((c0 & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
}
/*! @brief Decodes one unicode character from a utf-8 string and lower-cases
 *  it when it lies in the ASCII range.
 *
 * The character is decoded with getUtf8Code(). Values below 0x7f are passed
 * through tolower(); all other values are returned unchanged.
 *
 * @param s utf-8 encoded string
 * @param idx byte position of given string \a s.
 * @return the (possibly lower-cased) unicode codepoint
 * @see getUtf8Code()
 */
uint getUtf8CodeToLower( const QCString& s, int idx )
{
  const uint code = getUtf8Code( s, idx );
  if (code >= 0x7f)
  {
    // outside the ASCII range: no case conversion is attempted
    return code;
  }
  return (uint)tolower( code );
}
/*! @brief Decodes one unicode character from a utf-8 string and upper-cases
 *  it when it lies in the ASCII range.
 *
 * The character is decoded with getUtf8Code(). Values below 0x7f are passed
 * through toupper(); all other values are returned unchanged.
 *
 * @param s utf-8 encoded string
 * @param idx byte position of given string \a s.
 * @return the (possibly upper-cased) unicode codepoint
 * @see getUtf8Code()
 */
uint getUtf8CodeToUpper( const QCString& s, int idx )
{
  const uint code = getUtf8Code( s, idx );
  if (code >= 0x7f)
  {
    // outside the ASCII range: no case conversion is attempted
    return code;
  }
  return (uint)toupper( code );
}
//--------------------------------------------------------------------------------------
......@@ -25,6 +25,7 @@
#include <qlist.h>
#include <ctype.h>
#include "types.h"
#include "sortdict.h"
//--------------------------------------------------------------------
......@@ -87,6 +88,33 @@ class TextGeneratorOLImpl : public TextGeneratorIntf
//--------------------------------------------------------------------
/** @brief Dictionary keyed by a unicode character code, whose values are
 *  per-letter containers of type T holding T::ElementType items.
 *
 *  T is expected to be constructible from the letter code and to offer
 *  letter() and append() members, as used below.
 */
template<class T>
class LetterToIndexMap : public SIntDict<T>
{
  public:
    /** Creates an empty map with auto-delete switched on. */
    LetterToIndexMap()
    {
      SIntDict<T>::setAutoDelete(TRUE);
    }

    /** Orders two per-letter containers by their letter code.
     *  @return the difference between the letter code of \a item1 and
     *          that of \a item2.
     */
    int compareItems(QCollection::Item item1, QCollection::Item item2)
    {
      const int lhs = (int)((T *)item1)->letter();
      const int rhs = (int)((T *)item2)->letter();
      return lhs - rhs;
    }

    /** Adds \a elem to the container registered for \a letter; the
     *  container is created and inserted first if it is not yet present.
     */
    void append(uint letter,typename T::ElementType *elem)
    {
      const int key = (int)letter;
      T *bucket = SIntDict<T>::find(key);
      if (!bucket)
      {
        // first element for this letter: create its container
        bucket = new T(letter);
        SIntDict<T>::inSort(key,bucket);
      }
      bucket->append(elem);
    }
};
//--------------------------------------------------------------------
QCString langToString(SrcLangExt lang);
QCString getLanguageSpecificSeparator(SrcLangExt lang,bool classScope=FALSE);
......@@ -411,5 +439,9 @@ bool fileVisibleInIndex(FileDef *fd,bool &genSourceFile);
void addDocCrossReference(MemberDef *src,MemberDef *dst);
uint getUtf8Code( const QCString& s, int idx );
uint getUtf8CodeToLower( const QCString& s, int idx );
uint getUtf8CodeToUpper( const QCString& s, int idx );
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment