From 8db0e1abee5c8e9aa09014aaa7dea6aa66383f85 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Thu, 26 Nov 2020 15:22:20 +0100
Subject: [PATCH] Speedup some unicode rendering

Use a fast path for the column-width computation of ASCII characters. This is
especially relevant for llvm-objdump.

before:

    % time ./bin/llvm-objdump -D  -j .text /lib/libc.so.6 >/dev/null
    ./bin/llvm-objdump -D -j .text /lib/libc.so.6 > /dev/null  0.75s user 0.01s system 99% cpu 0.757 total

after:

    % time ./bin/llvm-objdump -D  -j .text /lib/libc.so.6 >/dev/null
    ./bin/llvm-objdump -D -j .text /lib/libc.so.6 > /dev/null  0.37s user 0.01s system 99% cpu 0.378 total

Differential Revision: https://reviews.llvm.org/D92180
---
 lib/Support/Unicode.cpp           | 11 +++++++++++
 unittests/Support/UnicodeTest.cpp | 15 +++++++++++++++
 2 files changed, 26 insertions(+)
diff --git a/lib/Support/Unicode.cpp b/lib/Support/Unicode.cpp
index 4d195069682..bb6e75555b4 100644
--- a/lib/Support/Unicode.cpp
+++ b/lib/Support/Unicode.cpp
@@ -339,11 +339,22 @@ static inline int charWidth(int UCS)
   return 1;
 }
 
+static bool isprintableascii(char c) { return c > 31 && c < 127; } // ' '..'~'
+
 int columnWidthUTF8(StringRef Text) {
   unsigned ColumnWidth = 0;
   unsigned Length;
   for (size_t i = 0, e = Text.size(); i < e; i += Length) {
     Length = getNumBytesForUTF8(Text[i]);
+
+    // fast path for ASCII characters
+    if (Length == 1) {
+      if (!isprintableascii(Text[i]))
+        return ErrorNonPrintableCharacter;
+      ColumnWidth += 1;
+      continue;
+    }
+
     if (Length <= 0 || i + Length > Text.size())
       return ErrorInvalidUTF8;
     UTF32 buf[1];
diff --git a/unittests/Support/UnicodeTest.cpp b/unittests/Support/UnicodeTest.cpp
index 376fbee4ae6..6ce323dc8f3 100644
--- a/unittests/Support/UnicodeTest.cpp
+++ b/unittests/Support/UnicodeTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Unicode.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "gtest/gtest.h"
 
 namespace llvm {
@@ -23,6 +24,7 @@ TEST(Unicode, columnWidthUTF8) {
   EXPECT_EQ(6, columnWidthUTF8("abcdef"));
 
   EXPECT_EQ(-1, columnWidthUTF8("\x01"));
+  EXPECT_EQ(-1, columnWidthUTF8("\t"));
   EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01"));
   EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE
 
@@ -84,6 +86,19 @@ TEST(Unicode, isPrintable) {
   EXPECT_TRUE(isPrintable(0x20000));  // CJK UNIFIED IDEOGRAPH-20000
 
   EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter
+
+  // Check that columnWidthUTF8's ASCII fast path agrees with isPrintable.
+  for (unsigned char c = 0; c < 128; ++c) {
+    const UTF8 buf8[2] = {c, 0};
+    const UTF8 *Target8 = &buf8[0];
+    UTF32 buf32[1];
+    UTF32 *Target32 = &buf32[0];
+    auto status = ConvertUTF8toUTF32(&Target8, Target8 + 1, &Target32,
+                                     Target32 + 1, strictConversion);
+    EXPECT_TRUE(status == conversionOK);
+    EXPECT_TRUE((columnWidthUTF8(reinterpret_cast<const char *>(buf8)) == 1) ==
+                (bool)isPrintable(buf32[0]));
+  }
 }
 
 } // namespace