[Support] Add StringRef::consumeInteger.

StringRef::getInteger() exists and treats the entire string as an integer of the specified radix, failing if any invalid characters are encountered or the number overflows. Sometimes you might have something like "123456foo" and you want to get the number 123456 and leave the string "foo" remaining. This is similar to what would be possible by using the standard runtime library functions strtoul et al and specifying an end pointer. This patch adds consumeInteger(), which does exactly that. It consumes as much as possible until an invalid character is found, and modifies the StringRef in place so that upon return only the portion of the StringRef after the number remains. Differential Revision: https://reviews.llvm.org/D24778 llvm-svn: 282164
2024-11-23 11:13:28 +01:00 · 2016-09-22 15:05:19 +00:00 · 2016-09-22 15:05:19 +00:00 · 3d1c3fcd78
commit 3d1c3fcd78
parent f1efb9897d
3 changed files with 267 additions and 25 deletions
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@ -32,6 +32,10 @@ namespace llvm {

  bool getAsSignedInteger(StringRef Str, unsigned Radix, long long &Result);

+  bool consumeUnsignedInteger(StringRef &Str, unsigned Radix,
+                              unsigned long long &Result);
+  bool consumeSignedInteger(StringRef &Str, unsigned Radix, long long &Result);
+
  /// StringRef - Represent a constant reference to a string, i.e. a character
  /// array and a length, which need not be null terminated.
  ///
@ -397,6 +401,37 @@ namespace llvm {
      return false;
    }

+    /// Parse the current string as an integer of the specified radix.  If
+    /// \p Radix is specified as zero, this does radix autosensing using
+    /// extended C rules: 0 is octal, 0x is hex, 0b is binary.
+    ///
+    /// If the string does not begin with a number of the specified radix,
+    /// this returns true to signify the error. The string is considered
+    /// erroneous if empty or if it overflows T.
+    /// The portion of the string representing the discovered numeric value
+    /// is removed from the beginning of the string.
+    template <typename T>
+    typename std::enable_if<std::numeric_limits<T>::is_signed, bool>::type
+    consumeInteger(unsigned Radix, T &Result) {
+      long long LLVal;
+      if (consumeSignedInteger(*this, Radix, LLVal) ||
+          static_cast<long long>(static_cast<T>(LLVal)) != LLVal)
+        return true;
+      Result = LLVal;
+      return false;
+    }
+
+    template <typename T>
+    typename std::enable_if<!std::numeric_limits<T>::is_signed, bool>::type
+    consumeInteger(unsigned Radix, T &Result) {
+      unsigned long long ULLVal;
+      if (consumeUnsignedInteger(*this, Radix, ULLVal) ||
+          static_cast<long long>(static_cast<T>(ULLVal)) != ULLVal)
+        return true;
+      Result = ULLVal;
+      return false;
+    }
+
    /// Parse the current string as an integer of the specified \p Radix, or of
    /// an autosensed radix if the \p Radix given is 0.  The current value in
    /// \p Result is discarded, and the storage is changed to be wide enough to
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@ -366,17 +366,16 @@ static unsigned GetAutoSenseRadix(StringRef &Str) {
    return 8;
  }

-  if (Str.startswith("0"))
+  if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) {
+    Str = Str.substr(1);
    return 8;
-  
+  }
+
  return 10;
 }

-
-/// GetAsUnsignedInteger - Workhorse method that converts a integer character
-/// sequence of radix up to 36 to an unsigned long long value.
-bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
-                                unsigned long long &Result) {
+bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
+                                  unsigned long long &Result) {
  // Autosense radix if not specified.
  if (Radix == 0)
    Radix = GetAutoSenseRadix(Str);
@ -385,44 +384,51 @@ bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
  if (Str.empty()) return true;

  // Parse all the bytes of the string given this radix.  Watch for overflow.
+  StringRef Str2 = Str;
  Result = 0;
-  while (!Str.empty()) {
+  while (!Str2.empty()) {
    unsigned CharVal;
-    if (Str[0] >= '0' && Str[0] <= '9')
-      CharVal = Str[0]-'0';
-    else if (Str[0] >= 'a' && Str[0] <= 'z')
-      CharVal = Str[0]-'a'+10;
-    else if (Str[0] >= 'A' && Str[0] <= 'Z')
-      CharVal = Str[0]-'A'+10;
+    if (Str2[0] >= '0' && Str2[0] <= '9')
+      CharVal = Str2[0] - '0';
+    else if (Str2[0] >= 'a' && Str2[0] <= 'z')
+      CharVal = Str2[0] - 'a' + 10;
+    else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
+      CharVal = Str2[0] - 'A' + 10;
    else
-      return true;
+      break;

-    // If the parsed value is larger than the integer radix, the string is
-    // invalid.
+    // If the parsed value is larger than the integer radix, we cannot
+    // consume any more characters.
    if (CharVal >= Radix)
-      return true;
+      break;

    // Add in this character.
    unsigned long long PrevResult = Result;
-    Result = Result*Radix+CharVal;
+    Result = Result * Radix + CharVal;

    // Check for overflow by shifting back and seeing if bits were lost.
-    if (Result/Radix < PrevResult)
+    if (Result / Radix < PrevResult)
      return true;

-    Str = Str.substr(1);
+    Str2 = Str2.substr(1);
  }

+  // We consider the operation a failure if no characters were consumed
+  // successfully.
+  if (Str.size() == Str2.size())
+    return true;
+
+  Str = Str2;
  return false;
 }

-bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
-                              long long &Result) {
+bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
+                                long long &Result) {
  unsigned long long ULLVal;

  // Handle positive strings first.
  if (Str.empty() || Str.front() != '-') {
-    if (getAsUnsignedInteger(Str, Radix, ULLVal) ||
+    if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
        // Check for value so large it overflows a signed value.
        (long long)ULLVal < 0)
      return true;
@ -431,17 +437,41 @@ bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
  }

  // Get the positive part of the value.
-  if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
+  StringRef Str2 = Str.drop_front(1);
+  if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
      // Reject values so large they'd overflow as negative signed, but allow
      // "-0".  This negates the unsigned so that the negative isn't undefined
      // on signed overflow.
      (long long)-ULLVal > 0)
    return true;

+  Str = Str2;
  Result = -ULLVal;
  return false;
 }

+/// GetAsUnsignedInteger - Workhorse method that converts a integer character
+/// sequence of radix up to 36 to an unsigned long long value.
+bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
+                                unsigned long long &Result) {
+  if (consumeUnsignedInteger(Str, Radix, Result))
+    return true;
+
+  // For getAsUnsignedInteger, we require the whole string to be consumed or
+  // else we consider it a failure.
+  return !Str.empty();
+}
+
+bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
+                              long long &Result) {
+  if (consumeSignedInteger(Str, Radix, Result))
+    return true;
+
+  // For getAsSignedInteger, we require the whole string to be consumed or else
+  // we consider it a failure.
+  return !Str.empty();
+}
+
 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
  StringRef Str = *this;

--- a/unittests/ADT/StringRefTest.cpp
+++ b/unittests/ADT/StringRefTest.cpp
@ -590,6 +590,183 @@ TEST(StringRefTest, getAsUnsignedIntegerBadStrings) {
  }
 }

+struct ConsumeUnsignedPair {
+  const char *Str;
+  uint64_t Expected;
+  const char *Leftover;
+} ConsumeUnsigned[] = {
+    {"0", 0, ""},
+    {"255", 255, ""},
+    {"256", 256, ""},
+    {"65535", 65535, ""},
+    {"65536", 65536, ""},
+    {"4294967295", 4294967295ULL, ""},
+    {"4294967296", 4294967296ULL, ""},
+    {"255A376", 255, "A376"},
+    {"18446744073709551615", 18446744073709551615ULL, ""},
+    {"18446744073709551615ABC", 18446744073709551615ULL, "ABC"},
+    {"042", 34, ""},
+    {"0x42", 66, ""},
+    {"0x42-0x34", 66, "-0x34"},
+    {"0b101010", 42, ""},
+    {"0429F", 042, "9F"},            // Auto-sensed octal radix, invalid digit
+    {"0x42G12", 0x42, "G12"},        // Auto-sensed hex radix, invalid digit
+    {"0b10101020101", 42, "20101"}}; // Auto-sensed binary radix, invalid digit.
+
+struct ConsumeSignedPair {
+  const char *Str;
+  int64_t Expected;
+  const char *Leftover;
+} ConsumeSigned[] = {
+    {"0", 0, ""},
+    {"-0", 0, ""},
+    {"0-1", 0, "-1"},
+    {"-0-1", 0, "-1"},
+    {"127", 127, ""},
+    {"128", 128, ""},
+    {"127-1", 127, "-1"},
+    {"128-1", 128, "-1"},
+    {"-128", -128, ""},
+    {"-129", -129, ""},
+    {"-128-1", -128, "-1"},
+    {"-129-1", -129, "-1"},
+    {"32767", 32767, ""},
+    {"32768", 32768, ""},
+    {"32767-1", 32767, "-1"},
+    {"32768-1", 32768, "-1"},
+    {"-32768", -32768, ""},
+    {"-32769", -32769, ""},
+    {"-32768-1", -32768, "-1"},
+    {"-32769-1", -32769, "-1"},
+    {"2147483647", 2147483647LL, ""},
+    {"2147483648", 2147483648LL, ""},
+    {"2147483647-1", 2147483647LL, "-1"},
+    {"2147483648-1", 2147483648LL, "-1"},
+    {"-2147483648", -2147483648LL, ""},
+    {"-2147483649", -2147483649LL, ""},
+    {"-2147483648-1", -2147483648LL, "-1"},
+    {"-2147483649-1", -2147483649LL, "-1"},
+    {"-9223372036854775808", -(9223372036854775807LL) - 1, ""},
+    {"-9223372036854775808-1", -(9223372036854775807LL) - 1, "-1"},
+    {"042", 34, ""},
+    {"042-1", 34, "-1"},
+    {"0x42", 66, ""},
+    {"0x42-1", 66, "-1"},
+    {"0b101010", 42, ""},
+    {"0b101010-1", 42, "-1"},
+    {"-042", -34, ""},
+    {"-042-1", -34, "-1"},
+    {"-0x42", -66, ""},
+    {"-0x42-1", -66, "-1"},
+    {"-0b101010", -42, ""},
+    {"-0b101010-1", -42, "-1"}};
+
+TEST(StringRefTest, consumeIntegerUnsigned) {
+  uint8_t U8;
+  uint16_t U16;
+  uint32_t U32;
+  uint64_t U64;
+
+  for (size_t i = 0; i < array_lengthof(ConsumeUnsigned); ++i) {
+    StringRef Str = ConsumeUnsigned[i].Str;
+    bool U8Success = Str.consumeInteger(0, U8);
+    if (static_cast<uint8_t>(ConsumeUnsigned[i].Expected) ==
+        ConsumeUnsigned[i].Expected) {
+      ASSERT_FALSE(U8Success);
+      EXPECT_EQ(U8, ConsumeUnsigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeUnsigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(U8Success);
+    }
+
+    Str = ConsumeUnsigned[i].Str;
+    bool U16Success = Str.consumeInteger(0, U16);
+    if (static_cast<uint16_t>(ConsumeUnsigned[i].Expected) ==
+        ConsumeUnsigned[i].Expected) {
+      ASSERT_FALSE(U16Success);
+      EXPECT_EQ(U16, ConsumeUnsigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeUnsigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(U16Success);
+    }
+
+    Str = ConsumeUnsigned[i].Str;
+    bool U32Success = Str.consumeInteger(0, U32);
+    if (static_cast<uint32_t>(ConsumeUnsigned[i].Expected) ==
+        ConsumeUnsigned[i].Expected) {
+      ASSERT_FALSE(U32Success);
+      EXPECT_EQ(U32, ConsumeUnsigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeUnsigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(U32Success);
+    }
+
+    Str = ConsumeUnsigned[i].Str;
+    bool U64Success = Str.consumeInteger(0, U64);
+    if (static_cast<uint64_t>(ConsumeUnsigned[i].Expected) ==
+        ConsumeUnsigned[i].Expected) {
+      ASSERT_FALSE(U64Success);
+      EXPECT_EQ(U64, ConsumeUnsigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeUnsigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(U64Success);
+    }
+  }
+}
+
+TEST(StringRefTest, consumeIntegerSigned) {
+  int8_t S8;
+  int16_t S16;
+  int32_t S32;
+  int64_t S64;
+
+  for (size_t i = 0; i < array_lengthof(ConsumeSigned); ++i) {
+    StringRef Str = ConsumeSigned[i].Str;
+    bool S8Success = Str.consumeInteger(0, S8);
+    if (static_cast<int8_t>(ConsumeSigned[i].Expected) ==
+        ConsumeSigned[i].Expected) {
+      ASSERT_FALSE(S8Success);
+      EXPECT_EQ(S8, ConsumeSigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeSigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(S8Success);
+    }
+
+    Str = ConsumeSigned[i].Str;
+    bool S16Success = Str.consumeInteger(0, S16);
+    if (static_cast<int16_t>(ConsumeSigned[i].Expected) ==
+        ConsumeSigned[i].Expected) {
+      ASSERT_FALSE(S16Success);
+      EXPECT_EQ(S16, ConsumeSigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeSigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(S16Success);
+    }
+
+    Str = ConsumeSigned[i].Str;
+    bool S32Success = Str.consumeInteger(0, S32);
+    if (static_cast<int32_t>(ConsumeSigned[i].Expected) ==
+        ConsumeSigned[i].Expected) {
+      ASSERT_FALSE(S32Success);
+      EXPECT_EQ(S32, ConsumeSigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeSigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(S32Success);
+    }
+
+    Str = ConsumeSigned[i].Str;
+    bool S64Success = Str.consumeInteger(0, S64);
+    if (static_cast<int64_t>(ConsumeSigned[i].Expected) ==
+        ConsumeSigned[i].Expected) {
+      ASSERT_FALSE(S64Success);
+      EXPECT_EQ(S64, ConsumeSigned[i].Expected);
+      EXPECT_EQ(Str, ConsumeSigned[i].Leftover);
+    } else {
+      ASSERT_TRUE(S64Success);
+    }
+  }
+}
+
 static const char *join_input[] = { "a", "b", "c" };
 static const char join_result1[] = "a";
 static const char join_result2[] = "a:b:c";