Bug 705910 - Indexing and searching cannot treat non ASCII identifiers

bca6baee · Dimitri van Heesch · f6bc941e · bca6baee · bca6baee · bca6baee
Commit bca6baee authored Sep 14, 2013 by Dimitri van Heesch
10 changed files
--- a/qtools/Doxyfile
+++ b/qtools/Doxyfile
@@ -174,7 +174,7 @@ QHP_SECT_FILTER_ATTRS  =
 QHG_LOCATION           =
 GENERATE_ECLIPSEHELP   = YES
 ECLIPSE_DOC_ID         = org.doxygen.qtools
-DISABLE_INDEX          = YES
+DISABLE_INDEX          = NO
 GENERATE_TREEVIEW      = YES
 ENUM_VALUES_PER_LINE   = 4
 TREEVIEW_WIDTH         = 250

--- a/src/index.cpp
+++ b/src/index.cpp
--- a/src/search.js
+++ b/src/search.js
@@ -5,7 +5,7 @@ function convertToId(search)
  {
    var c = search.charAt(i);
    var cn = c.charCodeAt(0);
-    if (c.match(/[a-z0-9]/))
+    if (c.match(/[a-z0-9\u0080-\uFFFF]/))
    {
      result+=c;
    }
@@ -310,22 +310,20 @@ function SearchBox(name, resultsPath, inFrame, label)
    var searchValue = this.DOMSearchField().value.replace(/^ +/, "");
    var code = searchValue.toLowerCase().charCodeAt(0);
-    var hexCode;
+    // Default: the index character is the first UTF-16 code unit, lower-cased.
+    var idxChar = searchValue.substr(0, 1).toLowerCase();
-    if (code<16) 
+    // A high surrogate (0xD800-0xDBFF) followed by at least one more code unit
+    // starts a surrogate pair, so the index character spans two code units.
+    // The length of the string must be tested here: comparing the string
+    // itself with 1 converts it to a number (NaN) and is always false.
+    if ( 0xD800 <= code && code <= 0xDBFF && searchValue.length > 1) // surrogate pair
    {
-      hexCode="0"+code.toString(16);
+      idxChar = searchValue.substr(0, 2);
-    }
-    else 
-    {
-      hexCode=code.toString(16);
    }
    var resultsPage;
    var resultsPageWithSearch;
    var hasResultsPage;
-    if (indexSectionsWithContent[this.searchIndex].charAt(code) == '1')
+    var idx = indexSectionsWithContent[this.searchIndex].indexOf(idxChar);
+    if (idx!=-1)
    {
+       var hexCode=idx.toString(16);
       resultsPage = this.resultsPath + '/' + indexSectionNames[this.searchIndex] + '_' + hexCode + '.html';
       resultsPageWithSearch = resultsPage+'?'+escape(searchValue);
       hasResultsPage = true;

--- a/src/search_functions.php
+++ b/src/search_functions.php
@@ -358,7 +358,7 @@ function main()
  $sorted = run_query($query);
  // Now output the HTML stuff...
  // End the HTML form
-  end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
+  end_form(preg_replace("/[^a-zA-Z0-9\-\_\.\x80-\xFF]/i", " ", $query ));
  // report results to the user
  report_results($sorted);
  end_page();

--- a/src/search_functions_php.h
+++ b/src/search_functions_php.h
@@ -358,7 +358,7 @@
 "  $sorted = run_query($query);\n"
 "  // Now output the HTML stuff...\n"
 "  // End the HTML form\n"
-"  end_form(preg_replace(\"/[^a-zA-Z0-9\\-\\_\\.]/i\", \" \", $query ));\n"
+"  end_form(preg_replace(\"/[^a-zA-Z0-9\\-\\_\\.\\x80-\\xFF]/i\", \" \", $query ));\n"
 "  // report results to the user\n"
 "  report_results($sorted);\n"
 "  end_page();\n"

--- a/src/search_js.h
+++ b/src/search_js.h
@@ -5,7 +5,7 @@
 "  {\n"
 "    var c = search.charAt(i);\n"
 "    var cn = c.charCodeAt(0);\n"
-"    if (c.match(/[a-z0-9]/))\n"
+"    if (c.match(/[a-z0-9\\u0080-\\uFFFF]/))\n"
 "    {\n"
 "      result+=c;\n"
 "    }\n"
@@ -310,22 +310,20 @@
 "    var searchValue = this.DOMSearchField().value.replace(/^ +/, \"\");\n"
 "\n"
 "    var code = searchValue.toLowerCase().charCodeAt(0);\n"
-"    var hexCode;\n"
+"    var idxChar = searchValue.substr(0, 1).toLowerCase();\n"
-"    if (code<16) \n"
+"    // test the length of the string: comparing the string itself with 1 is always false here\n"
+"    if ( 0xD800 <= code && code <= 0xDBFF && searchValue.length > 1) // surrogate pair\n"
 "    {\n"
-"      hexCode=\"0\"+code.toString(16);\n"
+"      idxChar = searchValue.substr(0, 2);\n"
-"    }\n"
-"    else \n"
-"    {\n"
-"      hexCode=code.toString(16);\n"
 "    }\n"
 "\n"
 "    var resultsPage;\n"
 "    var resultsPageWithSearch;\n"
 "    var hasResultsPage;\n"
 "\n"
-"    if (indexSectionsWithContent[this.searchIndex].charAt(code) == '1')\n"
+"    var idx = indexSectionsWithContent[this.searchIndex].indexOf(idxChar);\n"
+"    if (idx!=-1)\n"
 "    {\n"
+"       var hexCode=idx.toString(16);\n"
 "       resultsPage = this.resultsPath + '/' + indexSectionNames[this.searchIndex] + '_' + hexCode + '.html';\n"
 "       resultsPageWithSearch = resultsPage+'?'+escape(searchValue);\n"
 "       hasResultsPage = true;\n"

--- a/src/searchindex.cpp
+++ b/src/searchindex.cpp
--- a/src/sortdict.h
+++ b/src/sortdict.h
@@ -108,7 +108,7 @@ class SDict
     *  \param caseSensitive indicated whether the keys should be sorted
     *         in a case sensitive way.
     */
-    SDict(int size,bool caseSensitive=TRUE) : m_sizeIndex(0)
+    SDict(int size=17,bool caseSensitive=TRUE) : m_sizeIndex(0)
    {
      m_list = new SList<T>(this);
 #if AUTORESIZE
@@ -454,7 +454,7 @@ class SIntDict
     *  \param size The size of the dictionary. Should be a prime number for
     *              best distribution of elements.
     */
-    SIntDict(int size) : m_sizeIndex(0)
+    SIntDict(int size=17) : m_sizeIndex(0)
    {
      m_list = new SIntList<T>(this);
 #if AUTORESIZE
@@ -636,7 +636,7 @@ class SIntDict
        {
          return m_li->current();
        }
        /*! Moves the iterator to the next element.
         *  \return the new "current" element, or zero if the iterator was
         *          already pointing at the last element.
@@ -659,6 +659,76 @@ class SIntDict
        QListIterator<T> *m_li;
    };
+    class IteratorDict;         // first forward declare
+    friend class IteratorDict;  // then make it a friend
+    /*! Simple iterator for SIntDict. It iterates over the dictionary elements
+     *  in an unsorted way, but does provide information about the element's key.
+     */
+    class IteratorDict
+    {
+      public:
+        /*! Create an iterator given the dictionary. */
+        IteratorDict(const SIntDict<T> &dict)
+        {
+          m_di = new QIntDictIterator<T>(*dict.m_dict);
+        }
+        /*! Destroys the iterator. */
+        virtual ~IteratorDict()
+        {
+          delete m_di;
+        }
+        /*! Set the iterator to the first element of the dictionary.
+         *  \return The first element, or zero if the dictionary was empty.
+         */
+        T *toFirst() const
+        {
+          return m_di->toFirst();
+        }
+        /*! Set the iterator to the last element of the dictionary.
+         *  \return The last element, or zero if the dictionary was empty.
+         */
+        T *toLast() const
+        {
+          return m_di->toLast();
+        }
+        /*! Returns the element the iterator currently points at. */
+        T *current() const
+        {
+          return m_di->current();
+        }
+        /*! Returns the integer key of the current element. */
+        int currentKey() const
+        {
+          return m_di->currentKey();
+        }
+        /*! Moves the iterator to the next element.
+         *  \return the new "current" element, or zero if the iterator was
+         *          already pointing at the last element.
+         */
+        T *operator++()
+        {
+          return m_di->operator++();
+        }
+        /*! Moves the iterator to the previous element.
+         *  \return the new "current" element, or zero if the iterator was
+         *          already pointing at the first element.
+         */
+        T *operator--()
+        {
+          return m_di->operator--();
+        }
+      private:
+        // Must be the same iterator type that the constructor allocates;
+        // the integer-keyed iterator is also what makes currentKey() an int.
+        QIntDictIterator<T> *m_di;
+    };
 };
 #endif
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -7919,3 +7919,72 @@ void addDocCrossReference(MemberDef *src,MemberDef *dst)
  }
 }
+//--------------------------------------------------------------------------------------
+/*! @brief Get one unicode character as an unsigned integer from utf-8 string
+ *
+ * A byte below 0xC2 or from 0xF8 upwards is returned unchanged as a
+ * single-byte value. The following bytes of a multi-byte sequence are only
+ * masked with 0x3f; they are not checked for being valid continuation bytes.
+ *
+ * @param s utf-8 encoded string
+ * @param idx byte position of given string \a s.
+ * @return the unicode codepoint, 0 - MAX_UNICODE_CODEPOINT; 0 is also
+ *         returned when \a idx is negative or not inside \a s, or when the
+ *         multi-byte sequence starting at \a idx is cut off by the end of \a s.
+ * @see getUtf8CodeToLower()
+ * @see getUtf8CodeToUpper()
+ */
+uint getUtf8Code( const QCString& s, int idx )
+{
+  const int length = s.length();
+  // reject positions outside the string, including negative ones
+  if (idx < 0 || idx >= length) { return 0; }
+  const uint c0 = (uchar)s.at(idx);
+  if ( c0 < 0xC2 || c0 >= 0xF8 ) // ASCII, or a byte that cannot start a sequence: pass through
+  {
+    return c0;
+  }
+  if (idx+1 >= length) { return 0; } // truncated sequence
+  const uint c1 = ((uchar)s.at(idx+1)) & 0x3f;
+  if ( c0 < 0xE0 ) // 2 byte character
+  {
+    return ((c0 & 0x1f) << 6) | c1;
+  }
+  if (idx+2 >= length) { return 0; } // truncated sequence
+  const uint c2 = ((uchar)s.at(idx+2)) & 0x3f;
+  if ( c0 < 0xF0 ) // 3 byte character
+  {
+    return ((c0 & 0x0f) << 12) | (c1 << 6) | c2;
+  }
+  if (idx+3 >= length) { return 0; } // truncated sequence
+  // 4 byte character
+  const uint c3 = ((uchar)s.at(idx+3)) & 0x3f;
+  return ((c0 & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
+}
+/*! @brief Returns one unicode character as an unsigned integer
+ *  from utf-8 string, making the character lower case if it was upper case.
+ *
+ * Only values below 0x7f are passed through tolower(); every other
+ * code point is returned as decoded.
+ *
+ * @param s utf-8 encoded string
+ * @param idx byte position of given string \a s.
+ * @return the unicode codepoint, 0 - MAX_UNICODE_CODEPOINT, excludes 'A'-'Z'
+ * @see getUtf8Code()
+ */
+uint getUtf8CodeToLower( const QCString& s, int idx )
+{
+  const uint code = getUtf8Code( s, idx );
+  if (code < 0x7f)
+  {
+    return tolower( code );
+  }
+  return code;
+}
+/*! @brief Returns one unicode character as an unsigned integer
+ *  from utf-8 string, making the character upper case if it was lower case.
+ *
+ * Only values below 0x7f are passed through toupper(); every other
+ * code point is returned as decoded.
+ *
+ * @param s utf-8 encoded string
+ * @param idx byte position of given string \a s.
+ * @return the unicode codepoint, 0 - MAX_UNICODE_CODEPOINT, excludes 'a'-'z'
+ * @see getUtf8Code()
+ */
+uint getUtf8CodeToUpper( const QCString& s, int idx )
+{
+  const uint code = getUtf8Code( s, idx );
+  if (code < 0x7f)
+  {
+    return toupper( code );
+  }
+  return code;
+}
+//--------------------------------------------------------------------------------------
--- a/src/util.h
+++ b/src/util.h
@@ -25,6 +25,7 @@
 #include <qlist.h>
 #include <ctype.h>
 #include "types.h"
+#include "sortdict.h"
 //--------------------------------------------------------------------
@@ -87,6 +88,33 @@ class TextGeneratorOLImpl : public TextGeneratorIntf
 //--------------------------------------------------------------------
+/** @brief maps a unicode character code to a list of T::ElementType's
+ *
+ *  Each entry of type T is stored under the integer value of its letter.
+ */
+template<class T>
+class LetterToIndexMap : public SIntDict<T>
+{
+  public:
+    /** Creates the map and turns on auto-delete for the underlying dictionary. */
+    LetterToIndexMap()
+    {
+      SIntDict<T>::setAutoDelete(TRUE);
+    }
+    /** Compares two entries by their letter.
+     *  @return the difference between the letter of \a item1 and the
+     *          letter of \a item2.
+     */
+    int compareItems(QCollection::Item item1, QCollection::Item item2)
+    {
+      const int firstLetter  = (int)((T *)item1)->letter();
+      const int secondLetter = (int)((T *)item2)->letter();
+      return firstLetter-secondLetter;
+    }
+    /** Adds \a elem to the entry stored for \a letter; the entry is
+     *  created and inserted sorted when there is none yet.
+     */
+    void append(uint letter,typename T::ElementType *elem)
+    {
+      const int key = (int)letter;
+      T *entry = SIntDict<T>::find(key);
+      if (!entry)
+      {
+        entry = new T(letter);
+        SIntDict<T>::inSort(key,entry);
+      }
+      entry->append(elem);
+    }
+};
+//--------------------------------------------------------------------
 QCString langToString(SrcLangExt lang);
 QCString getLanguageSpecificSeparator(SrcLangExt lang,bool classScope=FALSE);
@@ -411,5 +439,9 @@ bool fileVisibleInIndex(FileDef *fd,bool &genSourceFile);
 void addDocCrossReference(MemberDef *src,MemberDef *dst);
+uint getUtf8Code( const QCString& s, int idx );
+uint getUtf8CodeToLower( const QCString& s, int idx );
+uint getUtf8CodeToUpper( const QCString& s, int idx );
 #endif