Merge pull request #12448 from r00ster91/ultimateascii

std.ascii: rename functions and other improvements
2026-02-20 00:08:56 +00:00 · 2022-10-15 14:28:33 -04:00 · 2022-10-15 14:28:33 -04:00 · c0d7f64036
commit c0d7f64036
parent f9192adaba 4ea3a9ba9f
1 changed files with 210 additions and 31 deletions
--- a/lib/std/ascii.zig
+++ b/lib/std/ascii.zig
@ -1,54 +1,164 @@
-// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does.
-// I could have taken only a u7 to make this clear, but it would be slower
-// It is my opinion that encodings other than UTF-8 should not be supported.
-//
-// (and 128 bytes is not much to pay).
-// Also does not handle Unicode character classes.
-//
-// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png
+//! The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard.
+//!
+//! This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding.
+//!
+//! Even though this module concerns itself with 7-bit ASCII,
+//! functions use `u8` as the type instead of `u7` for convenience and compatibility.
+//! Characters outside of the 7-bit range are gracefully handled (e.g. by returning `false`).
+//!
+//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set

 const std = @import("std");

-/// Contains constants for the C0 control codes of the ASCII encoding.
-/// https://en.wikipedia.org/wiki/C0_and_C1_control_codes
+// TODO: remove all decls marked as DEPRECATED after 0.10.0's release
+
+/// The C0 control codes of the ASCII encoding.
+///
+/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`.
 pub const control_code = struct {
+    // DEPRECATED: use the lowercase variant
    pub const NUL = 0x00;
+    // DEPRECATED: use the lowercase variant
    pub const SOH = 0x01;
+    // DEPRECATED: use the lowercase variant
    pub const STX = 0x02;
+    // DEPRECATED: use the lowercase variant
    pub const ETX = 0x03;
+    // DEPRECATED: use the lowercase variant
    pub const EOT = 0x04;
+    // DEPRECATED: use the lowercase variant
    pub const ENQ = 0x05;
+    // DEPRECATED: use the lowercase variant
    pub const ACK = 0x06;
+    // DEPRECATED: use the lowercase variant
    pub const BEL = 0x07;
+    // DEPRECATED: use the lowercase variant
    pub const BS = 0x08;
+    // DEPRECATED: use `ht`
    pub const TAB = 0x09;
+    // DEPRECATED: use the lowercase variant
    pub const LF = 0x0A;
+    // DEPRECATED: use the lowercase variant
    pub const VT = 0x0B;
+    // DEPRECATED: use the lowercase variant
    pub const FF = 0x0C;
+    // DEPRECATED: use the lowercase variant
    pub const CR = 0x0D;
+    // DEPRECATED: use the lowercase variant
    pub const SO = 0x0E;
+    // DEPRECATED: use the lowercase variant
    pub const SI = 0x0F;
+    // DEPRECATED: use the lowercase variant
    pub const DLE = 0x10;
+    // DEPRECATED: use the lowercase variant
    pub const DC1 = 0x11;
+    // DEPRECATED: use the lowercase variant
    pub const DC2 = 0x12;
+    // DEPRECATED: use the lowercase variant
    pub const DC3 = 0x13;
+    // DEPRECATED: use the lowercase variant
    pub const DC4 = 0x14;
+    // DEPRECATED: use the lowercase variant
    pub const NAK = 0x15;
+    // DEPRECATED: use the lowercase variant
    pub const SYN = 0x16;
+    // DEPRECATED: use the lowercase variant
    pub const ETB = 0x17;
+    // DEPRECATED: use the lowercase variant
    pub const CAN = 0x18;
+    // DEPRECATED: use the lowercase variant
    pub const EM = 0x19;
+    // DEPRECATED: use the lowercase variant
    pub const SUB = 0x1A;
+    // DEPRECATED: use the lowercase variant
    pub const ESC = 0x1B;
+    // DEPRECATED: use the lowercase variant
    pub const FS = 0x1C;
+    // DEPRECATED: use the lowercase variant
    pub const GS = 0x1D;
+    // DEPRECATED: use the lowercase variant
    pub const RS = 0x1E;
+    // DEPRECATED: use the lowercase variant
    pub const US = 0x1F;
-
+    // DEPRECATED: use the lowercase variant
    pub const DEL = 0x7F;
-
+    // DEPRECATED: use the lowercase variant
    pub const XON = 0x11;
+    // DEPRECATED: use the lowercase variant
    pub const XOFF = 0x13;
+
+    /// Null.
+    pub const nul = 0x00;
+    /// Start of Heading.
+    pub const soh = 0x01;
+    /// Start of Text.
+    pub const stx = 0x02;
+    /// End of Text.
+    pub const etx = 0x03;
+    /// End of Transmission.
+    pub const eot = 0x04;
+    /// Enquiry.
+    pub const enq = 0x05;
+    /// Acknowledge.
+    pub const ack = 0x06;
+    /// Bell, Alert.
+    pub const bel = 0x07;
+    /// Backspace.
+    pub const bs = 0x08;
+    /// Horizontal Tab, Tab ('\t').
+    pub const ht = 0x09;
+    /// Line Feed, Newline ('\n').
+    pub const lf = 0x0A;
+    /// Vertical Tab.
+    pub const vt = 0x0B;
+    /// Form Feed.
+    pub const ff = 0x0C;
+    /// Carriage Return ('\r').
+    pub const cr = 0x0D;
+    /// Shift Out.
+    pub const so = 0x0E;
+    /// Shift In.
+    pub const si = 0x0F;
+    /// Data Link Escape.
+    pub const dle = 0x10;
+    /// Device Control One (XON).
+    pub const dc1 = 0x11;
+    /// Device Control Two.
+    pub const dc2 = 0x12;
+    /// Device Control Three (XOFF).
+    pub const dc3 = 0x13;
+    /// Device Control Four.
+    pub const dc4 = 0x14;
+    /// Negative Acknowledge.
+    pub const nak = 0x15;
+    /// Synchronous Idle.
+    pub const syn = 0x16;
+    /// End of Transmission Block
+    pub const etb = 0x17;
+    /// Cancel.
+    pub const can = 0x18;
+    /// End of Medium.
+    pub const em = 0x19;
+    /// Substitute.
+    pub const sub = 0x1A;
+    /// Escape.
+    pub const esc = 0x1B;
+    /// File Separator.
+    pub const fs = 0x1C;
+    /// Group Separator.
+    pub const gs = 0x1D;
+    /// Record Separator.
+    pub const rs = 0x1E;
+    /// Unit Separator.
+    pub const us = 0x1F;
+
+    /// Delete.
+    pub const del = 0x7F;
+
+    /// An alias to `dc1`.
+    pub const xon = dc1;
+    /// An alias to `dc3`.
+    pub const xoff = dc3;
 };

 const tIndex = enum(u3) {
@ -188,73 +298,106 @@ fn inTable(c: u8, t: tIndex) bool {
    return (combinedTable[c] & (@as(u8, 1) << @enumToInt(t))) != 0;
 }

-pub fn isAlNum(c: u8) bool {
+/// DEPRECATED: use `isAlphanumeric`
+pub const isAlNum = isAlphanumeric;
+/// DEPRECATED: use `isAlpha`
+pub const isAlpha = isAlphabetic;
+/// DEPRECATED: use `isAlpha`
+pub const isCntrl = isControl;
+/// DEPRECATED: use `isWhitespace`.
+pub const isSpace = isWhitespace;
+/// DEPRECATED: use `whitespace`.
+pub const spaces = whitespace;
+/// DEPRECATED: use `isHex`.
+pub const isXDigit = isHex;
+
+/// Returns whether the character is alphanumeric.
+pub fn isAlphanumeric(c: u8) bool {
    return (combinedTable[c] & ((@as(u8, 1) << @enumToInt(tIndex.Alpha)) |
        @as(u8, 1) << @enumToInt(tIndex.Digit))) != 0;
 }

-pub fn isAlpha(c: u8) bool {
+/// Returns whether the character is alphabetic.
+pub fn isAlphabetic(c: u8) bool {
    return inTable(c, tIndex.Alpha);
 }

-pub fn isCntrl(c: u8) bool {
-    return c < 0x20 or c == 127; //DEL
+/// Returns whether the character is a control character.
+/// This is the same as `!isPrint(c)`.
+///
+/// See also: `control_code`.
+pub fn isControl(c: u8) bool {
+    return c <= control_code.us or c == control_code.del;
 }

+/// Returns whether the character is a digit.
 pub fn isDigit(c: u8) bool {
    return inTable(c, tIndex.Digit);
 }

+/// DEPRECATED: use `isPrint(c) and c != ' '` instead
 pub fn isGraph(c: u8) bool {
    return inTable(c, tIndex.Graph);
 }

+/// Returns whether the character is a lowercased letter.
 pub fn isLower(c: u8) bool {
    return inTable(c, tIndex.Lower);
 }

+/// Returns whether the character has some graphical representation and can be printed.
+/// This also returns `true` for the space character.
+/// This is the same as `!isControl(c)`.
 pub fn isPrint(c: u8) bool {
    return inTable(c, tIndex.Graph) or c == ' ';
 }

+/// DEPRECATED: create your own function based on your needs and what you want to do.
 pub fn isPunct(c: u8) bool {
    return inTable(c, tIndex.Punct);
 }

-pub fn isSpace(c: u8) bool {
+/// Returns whether this character is included in `whitespace`.
+pub fn isWhitespace(c: u8) bool {
    return inTable(c, tIndex.Space);
 }

-/// All the values for which isSpace() returns true. This may be used with
-/// e.g. std.mem.trim() to trim whiteSpace.
-pub const spaces = [_]u8{ ' ', '\t', '\n', '\r', control_code.VT, control_code.FF };
+/// Whitespace for general use.
+/// This may be used with e.g. `std.mem.trim` to trim whitespace.
+///
+/// See also: `isWhitespace`.
+pub const whitespace = [_]u8{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff };

-test "spaces" {
-    const testing = std.testing;
-    for (spaces) |space| try testing.expect(isSpace(space));
+test "whitespace" {
+    for (whitespace) |char| try std.testing.expect(isWhitespace(char));

    var i: u8 = 0;
    while (isASCII(i)) : (i += 1) {
-        if (isSpace(i)) try testing.expect(std.mem.indexOfScalar(u8, &spaces, i) != null);
+        if (isWhitespace(i)) try std.testing.expect(std.mem.indexOfScalar(u8, &whitespace, i) != null);
    }
 }

+/// Returns whether the character is an uppercased letter.
 pub fn isUpper(c: u8) bool {
    return inTable(c, tIndex.Upper);
 }

-pub fn isXDigit(c: u8) bool {
+/// Returns whether the character is a hexadecimal digit. This is case-insensitive.
+pub fn isHex(c: u8) bool {
    return inTable(c, tIndex.Hex);
 }

+/// Returns whether the character is a 7-bit ASCII character.
 pub fn isASCII(c: u8) bool {
    return c < 128;
 }

+/// DEPRECATED: use `c == ' ' or c == '\t'` or try `isWhitespace`
 pub fn isBlank(c: u8) bool {
    return (c == ' ') or (c == '\x09');
 }

+/// Uppercases the character and returns it as-is if it's already uppercased or not a letter.
 pub fn toUpper(c: u8) u8 {
    if (isLower(c)) {
        return c & 0b11011111;
@ -263,6 +406,7 @@ pub fn toUpper(c: u8) u8 {
    }
 }

+/// Lowercases the character and returns it as-is if it's already lowercased or not a letter.
 pub fn toLower(c: u8) u8 {
    if (isUpper(c)) {
        return c | 0b00100000;
@ -274,13 +418,50 @@ pub fn toLower(c: u8) u8 {
 test "ascii character classes" {
    const testing = std.testing;

+    try testing.expect(!isControl('a'));
+    try testing.expect(!isControl('z'));
+    try testing.expect(isControl(control_code.nul));
+    try testing.expect(isControl(control_code.ff));
+    try testing.expect(isControl(control_code.us));
+
    try testing.expect('C' == toUpper('c'));
    try testing.expect(':' == toUpper(':'));
    try testing.expect('\xab' == toUpper('\xab'));
+    try testing.expect(!isUpper('z'));
+
    try testing.expect('c' == toLower('C'));
+    try testing.expect(':' == toLower(':'));
+    try testing.expect('\xab' == toLower('\xab'));
+    try testing.expect(!isLower('Z'));
+
+    try testing.expect(isAlphanumeric('Z'));
+    try testing.expect(isAlphanumeric('z'));
+    try testing.expect(isAlphanumeric('5'));
+    try testing.expect(isAlphanumeric('5'));
+    try testing.expect(!isAlphanumeric('!'));
+
+    try testing.expect(!isAlpha('5'));
    try testing.expect(isAlpha('c'));
    try testing.expect(!isAlpha('5'));
-    try testing.expect(isSpace(' '));
+
+    try testing.expect(isWhitespace(' '));
+    try testing.expect(isWhitespace('\t'));
+    try testing.expect(isWhitespace('\r'));
+    try testing.expect(isWhitespace('\n'));
+    try testing.expect(!isWhitespace('.'));
+
+    try testing.expect(!isHex('g'));
+    try testing.expect(isHex('b'));
+    try testing.expect(isHex('9'));
+
+    try testing.expect(!isDigit('~'));
+    try testing.expect(isDigit('0'));
+    try testing.expect(isDigit('9'));
+
+    try testing.expect(isPrint(' '));
+    try testing.expect(isPrint('@'));
+    try testing.expect(isPrint('~'));
+    try testing.expect(!isPrint(control_code.esc));
 }

 /// Writes a lower case copy of `ascii_string` to `output`.
@ -341,7 +522,7 @@ test "allocUpperString" {
    try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", result);
 }

-/// Compares strings `a` and `b` case insensitively and returns whether they are equal.
+/// Compares strings `a` and `b` case-insensitively and returns whether they are equal.
 pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;
    for (a) |a_c, i| {
@ -397,11 +578,10 @@ test "indexOfIgnoreCase" {
    try std.testing.expect(indexOfIgnoreCase("one two three FouR", "gOur") == null);
    try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0);
    try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null);
-
    try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0);
 }

-/// Compares two slices of numbers lexicographically. O(n).
+/// Returns the lexicographical order of two slices. O(n).
 pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
    const n = std.math.min(lhs.len, rhs.len);
    var i: usize = 0;
@ -415,8 +595,7 @@ pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
    return std.math.order(lhs.len, rhs.len);
 }

-/// Returns true if lhs < rhs, false otherwise
-/// TODO rename "IgnoreCase" to "Insensitive" in this entire file.
+/// Returns whether the lexicographical order of `lhs` is lower than `rhs`.
 pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool {
    return orderIgnoreCase(lhs, rhs) == .lt;
 }