dlang · baziotis · Jul 4, 2019 · Jul 4, 2019 · Jul 4, 2019 · Jul 4, 2019
diff --git a/mak/COPY b/mak/COPY
@@ -21,6 +21,8 @@ COPY=\
 	$(IMPDIR)\core\time.d \
 	$(IMPDIR)\core\vararg.d \
 	\
+	$(IMPDIR)\core\experimental\memutils.d \
+    \
 	$(IMPDIR)\core\internal\abort.d \
 	$(IMPDIR)\core\internal\arrayop.d \
 	$(IMPDIR)\core\internal\convert.d \

diff --git a/mak/DOCS b/mak/DOCS
@@ -19,6 +19,8 @@ DOCS=\
 	$(DOCDIR)\core_gc_config.html \
 	$(DOCDIR)\core_gc_gcinterface.html \
 	$(DOCDIR)\core_gc_registry.html \
+    \
+	$(DOCDIR)\core_experimental_memutils.html \
 	\
 	$(DOCDIR)\core_stdc_assert_.html \
 	$(DOCDIR)\core_stdc_config.html \

diff --git a/mak/SRCS b/mak/SRCS
@@ -16,6 +16,8 @@ SRCS=\
 	src\core\thread.d \
 	src\core\time.d \
 	src\core\vararg.d \
+    \
+	src\core\experimental\memutils.d \
 	\
 	src\core\gc\config.d \
 	src\core\gc\gcinterface.d \

diff --git a/mak/WINDOWS b/mak/WINDOWS
@@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d
 $(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d
 	copy $** $@
 
+$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d
+	copy $** $@
+
 $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
 	copy $** $@
 

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
@@ -0,0 +1,179 @@
+/**
+ * Pure D replacement of the C Standard Library basic memory building blocks of string.h
+ * Source: $(DRUNTIMESRC core/experimental/memutils.d)
+ */
+module core.experimental.memutils;
+
+/** memset() implementation */
+
+/**
+ * NOTE(stefanos):
+ * Range-checking is not needed since the user never
+ * passes an `n` (byte count) directly.
+ */
+
+/*
+  Fills `dst` with the byte `val`.
+
+  When T is an array type, every byte of the array's contents is overwritten:
+  `dst.length` elements of `typeof(dst[0]).sizeof` bytes each, starting at `dst.ptr`.
+  For any other T, the T.sizeof bytes of the object at `&dst` are overwritten.
+*/
+
+void memset(T)(ref T dst, const ubyte val)
+{
+    import core.internal.traits : isArray;
+
+    // Dmemset takes the fill value widened to a uint.
+    const uint fill = val;
+    static if (!isArray!T)
+    {
+        // Plain value: overwrite the object itself.
+        Dmemset(&dst, fill, T.sizeof);
+    }
+    else
+    {
+        // Array: overwrite the elements.
+        alias Elem = typeof(dst[0]);
+        size_t byteCount = dst.length * Elem.sizeof;
+        Dmemset(dst.ptr, fill, byteCount);
+    }
+}
+
+version (GNU)
+{
+    // version (GNU): no SIMD path is provided in this branch, so the request is
+    // forwarded unchanged to the byte-by-byte implementation.
+    private void Dmemset(void *d, const uint val, size_t n)
+    {
+        memsetNaive(d, val, n);
+    }
+}
+else
+version (D_SIMD)
+{
+    /* SIMD implementation (16-byte vectors).
+     *
+     * Sets the `n` bytes starting at `d` to the low byte of `val`.
+     *
+     * Strategy for n >= 32:
+     *   1. one unaligned 16-byte store at `d`;
+     *   2. advance `d` to the next 16-byte boundary and store 32 bytes per
+     *      iteration while at least 32 bytes remain;
+     *   3. one final 32-byte store that ends exactly at the last byte and covers
+     *      whatever tail the loop left over.
+     * The stores may overlap each other; that is harmless because they all write
+     * the same value. Sizes below 32 are delegated to memsetNaive().
+     */
+    private void Dmemset(void *d, const uint val, size_t n)
+    {
+        import core.simd : int4;
+        version (LDC)
+        {
+            import ldc.simd : loadUnaligned, storeUnaligned;
+        }
+        else version (DigitalMars)
+        {
+            import core.simd : void16, loadUnaligned, storeUnaligned;
+        }
+        else
+        {
+            static assert(0, "Only DMD / LDC are supported");
+        }
+        // TODO(stefanos): Is there a way to make them @safe?
+        // (The problem is that for LDC, they could take int* or float* pointers
+        // but the cast to void16 for DMD is necessary anyway).
+
+        // Stores `reg` twice, filling the 32 bytes starting at `dest` (no alignment required).
+        void store32i_sse(void *dest, int4 reg)
+        {
+            version (LDC)
+            {
+                storeUnaligned!int4(reg, cast(int*) dest);
+                storeUnaligned!int4(reg, cast(int*) (dest+0x10));
+            }
+            else
+            {
+                storeUnaligned(cast(void16*) dest, reg);
+                storeUnaligned(cast(void16*) (dest+0x10), reg);
+            }
+        }
+        // Stores `reg` once, filling the 16 bytes starting at `dest` (no alignment required).
+        void store16i_sse(void *dest, int4 reg)
+        {
+            version (LDC)
+            {
+                storeUnaligned!int4(reg, cast(int*) dest);
+            }
+            else
+            {
+                storeUnaligned(cast(void16*) dest, reg);
+            }
+        }
+        // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
+        // than the previous classic switch. BUT. Using the switch had a significant
+        // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
+        // but the fact that it's more difficult to optimize it as part of the rest of the code.
+        //
+        // The threshold must be 32, not 16: the final store of this function writes the
+        // 32 bytes that end at `d + n`. For n < 32 that store would begin before `d` and
+        // overwrite memory in front of the destination.
+        if (n < 32)
+        {
+            memsetNaive(d, val, n);
+            return;
+        }
+        // Broadcast the fill byte to all 4 bytes of a uint. Only the low byte of `val`
+        // is used, which keeps this path consistent with memsetNaive().
+        const uint v = (val & 0xFF) * 0x01010101;
+        // Broadcast v to all 16 bytes of the vector.
+        auto xmm0 = int4(v);
+        void *last32 = d + n - 32;                  // Start of the last 32 bytes (n >= 32 here).
+        ubyte rem = cast(ubyte) d & 15;              // Remainder from the previous 16-byte boundary.
+        // Store 16 bytes, from which some will possibly overlap on a future store.
+        // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
+        // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
+        // 16, we store 16 bytes anyway.
+        store16i_sse(d, xmm0);
+        d += 16 - rem;
+        n -= 16 - rem;
+        // Move in blocks of 32.
+        // TODO(stefanos): Experiment with different sizes.
+        if (n >= 32)
+        {
+            // Align to (previous) multiple of 32. That does something invisible to the code,
+            // but a good optimizer will avoid a `cmp` instruction inside the loop. With a
+            // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
+            // sub RDX, 32;
+            // jge START_OF_THE_LOOP.
+            // Without that, it has to be:
+            // sub RDX, 32;
+            // cmp RDX, 32;
+            // jge START_OF_THE_LOOP
+            // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means
+            // we have somehow to compensate for that, which is done at the end of this function.
+            n &= -32;
+            do
+            {
+                store32i_sse(d, xmm0);
+                // NOTE(stefanos): I tried avoiding this operation on `d` by combining
+                // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
+                d += 32;
+                n -= 32;
+            } while (n >= 32);
+        }
+        // Compensate for the last (at most) 32 bytes.
+        store32i_sse(last32, xmm0);
+    }
+
+    // Regression test: every length/alignment combination must write exactly the
+    // requested bytes. Lengths 17 .. 31 previously wrote before the destination.
+    unittest
+    {
+        ubyte[128] buf;
+        foreach (offset; 0 .. 16)
+        {
+            foreach (len; 0 .. 80)
+            {
+                buf[] = 0xAA;
+                Dmemset(buf.ptr + 16 + offset, 0x5C, len);
+                foreach (i, b; buf)
+                {
+                    const bool inside = i >= 16 + offset && i < 16 + offset + len;
+                    assert(b == (inside ? 0x5C : 0xAA));
+                }
+            }
+        }
+    }
+
+}
+else
+{
+    // Neither version (GNU) nor version (D_SIMD) applies: forward unchanged to the
+    // byte-by-byte implementation.
+    private void Dmemset(void *d, const uint val, size_t n)
+    {
+        memsetNaive(d, val, n);
+    }
+}
+
+/* Naive implementation: stores the low byte of `val` into each of the `n`
+ * bytes starting at `dst`, one byte at a time.
+ */
+private void memsetNaive(void *dst, const uint val, size_t n)
+{
+    const ubyte fill = cast(ubyte) val;
+    ubyte *cursor = cast(ubyte*) dst;
+    const ubyte *end = cursor + n;
+    while (cursor != end)
+    {
+        *cursor = fill;
+        ++cursor;
+    }
+}
+
+
+/** Core features tests.
+  */
+unittest
+{
+    // Static array: every element must receive the fill byte.
+    ubyte[3] arr;
+    memset(arr, 7);
+    foreach (elem; arr)
+    {
+        assert(elem == 7);
+    }
+
+    // Non-array value: every byte of its representation must receive the fill byte.
+    real r;
+    memset(r, 9);
+    ubyte *bytes = cast(ubyte*) &r;
+    for (size_t i = 0; i < r.sizeof; ++i)
+    {
+        assert(bytes[i] == 9);
+    }
+}
diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d
@@ -567,3 +567,117 @@ if (func.length == 1 /*&& isCallable!func*/)
     static assert(P_dglit.length == 1);
     static assert(is(P_dglit[0] == int));
 }
+
+// [For internal use]
+// Applies the template `Modifier` to `T` with its type qualifiers removed and
+// then puts the same qualifiers back on the result.
+// Each branch matches one qualifier combination, binds the unqualified type to
+// `U`, and re-applies that combination to `Modifier!U`. `immutable` is tested
+// first; the remaining combinations are tested before the combinations they
+// contain (e.g. `shared inout const` before `shared inout`).
+// A type that matches none of them is passed to `Modifier` unchanged.
+package template ModifyTypePreservingTQ(alias Modifier, T)
+{
+         static if (is(T U ==          immutable U)) alias ModifyTypePreservingTQ =          immutable Modifier!U;
+    else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U;
+    else static if (is(T U == shared inout       U)) alias ModifyTypePreservingTQ = shared inout       Modifier!U;
+    else static if (is(T U == shared       const U)) alias ModifyTypePreservingTQ = shared       const Modifier!U;
+    else static if (is(T U == shared             U)) alias ModifyTypePreservingTQ = shared             Modifier!U;
+    else static if (is(T U ==        inout const U)) alias ModifyTypePreservingTQ =        inout const Modifier!U;
+    else static if (is(T U ==        inout       U)) alias ModifyTypePreservingTQ =              inout Modifier!U;
+    else static if (is(T U ==              const U)) alias ModifyTypePreservingTQ =              const Modifier!U;
+    else                                             alias ModifyTypePreservingTQ =                    Modifier!T;
+}
+
+// Checks ModifyTypePreservingTQ with a modifier that maps every type to `int`:
+// for each qualifier combination on `real`, the result must be `int` carrying
+// exactly the same qualifiers.
+@safe unittest
+{
+    alias Intify(T) = int;
+    static assert(is(ModifyTypePreservingTQ!(Intify,                    real) ==                    int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,              const real) ==              const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,        inout       real) ==        inout       int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,        inout const real) ==        inout const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared             real) == shared             int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared       const real) == shared       const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared inout       real) == shared inout       int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared inout const real) == shared inout const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,          immutable real) ==          immutable int));
+}
+
+/**
+ * Strips off all `enum`s from type `T`.
+ *
+ * An enum is replaced by its base type; if that base type is itself an enum the
+ * process repeats until a non-enum type is reached. The type qualifiers of `T`
+ * are carried over to the result via `ModifyTypePreservingTQ`.
+ */
+template OriginalType(T)
+{
+    // Works on the unqualified type: unwrap one enum layer and recurse, or stop
+    // at the first type that is not an enum.
+    template Impl(T)
+    {
+        static if (is(T U == enum)) alias Impl = OriginalType!U;
+        else                        alias Impl =              T;
+    }
+
+    alias OriginalType = ModifyTypePreservingTQ!(Impl, T);
+}
+
+///
+@safe unittest
+{
+    enum E : real { a = 0 } // NOTE: explicit initialization to 0 required during Enum init deprecation cycle
+    enum F : E    { a = E.a }   // enum whose base type is another enum
+    alias G = const(F);         // qualified enum type
+    static assert(is(OriginalType!E == real));
+    static assert(is(OriginalType!F == real));          // both enum layers are removed
+    static assert(is(OriginalType!G == const real));    // the qualifier is kept
+}
+
+/**
+ * Detect whether type `T` is an aggregate type.
+ *
+ * True exactly when `T` is a `struct`, `union`, `class` or `interface`.
+ */
+enum bool isAggregateType(T) = is(T == struct) || is(T == union) ||
+                               is(T == class) || is(T == interface);
+
+// Yields the type of the member that `T` names in its `alias this`.
+// Only instantiable for aggregate types (see the template constraint), and
+// fails with a static assert unless `__traits(getAliasThis, T)` reports exactly
+// one member.
+private template AliasThisTypeOf(T)
+if (isAggregateType!T)
+{
+    alias members = __traits(getAliasThis, T);
+
+    static if (members.length == 1)
+    {
+        // Type of that member, looked up on T.init.
+        alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0]));
+    }
+    else
+        static assert(0, T.stringof~" does not have alias this type");
+}
+
+/*
+ * Yields the dynamic array type that `T` is, or that `T` resolves to through
+ * `alias this`. Instantiation fails with a static assert when there is none.
+ */
+template DynamicArrayTypeOf(T)
+{
+    // If T has an `alias this` type AT (and AT[] is not the same type as AT),
+    // resolve through AT recursively; otherwise start from T with its enum
+    // layers removed.
+    static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT))
+        alias X = DynamicArrayTypeOf!AT;
+    else
+        alias X = OriginalType!T;
+
+    // Accept X only if Unqual!X converts to some E[] and `X.length` cannot be
+    // used to initialize an enum, i.e. the length is not a compile-time constant.
+    static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; })))
+    {
+        alias DynamicArrayTypeOf = X;
+    }
+    else
+        static assert(0, T.stringof~" is not a dynamic array");
+}
+
+// TODO(stefanos): More unit-testing.
+
+// Checks DynamicArrayTypeOf in both directions. The negative cases alone would
+// still pass if the template rejected every type, so plain dynamic arrays are
+// asserted to be accepted and returned unchanged.
+@safe unittest
+{
+    // Accepted: dynamic arrays resolve to themselves.
+    static assert(is(DynamicArrayTypeOf!(int[]) == int[]));
+    static assert(is(DynamicArrayTypeOf!(string) == string));
+
+    // Rejected: static arrays and typeof(null).
+    static assert(!is(DynamicArrayTypeOf!(int[3])));
+    static assert(!is(DynamicArrayTypeOf!(void[3])));
+    static assert(!is(DynamicArrayTypeOf!(typeof(null))));
+}
+
+/**
+ * Detect whether type `T` is a dynamic array.
+ *
+ * Aggregate types are rejected even when `DynamicArrayTypeOf` can resolve them.
+ */
+enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T;
+
+/**
+ * Detect whether type `T` is an array (static or dynamic; for associative
+ *  arrays see $(LREF isAssociativeArray)).
+ */
+enum bool isArray(T) = isStaticArray!T || isDynamicArray!T;
+
+/**
+ * Detect whether type `T` is a static array.
+ *
+ * Forwards to `__traits(isStaticArray, T)`.
+ */
+enum bool isStaticArray(T) = __traits(isStaticArray, T);
diff --git a/test/experimental/Makefile b/test/experimental/Makefile
@@ -0,0 +1,17 @@
+# Builds and runs the tests listed in TESTS.
+# ROOT, SRC, DMD, DFLAGS, QUIET, TIMELIMIT and RUN_ARGS are not defined here;
+# presumably they come from ../common.mak (confirm there).
+include ../common.mak
+
+# One executable is built and run per name in this list.
+TESTS:=memutils
+
+.PHONY: all clean
+# `all` requires a `.done` stamp file under $(ROOT) for every entry of TESTS.
+all: $(addprefix $(ROOT)/,$(addsuffix .done,$(TESTS)))
+
+# Run a built test executable, then create its stamp file.
+$(ROOT)/%.done: $(ROOT)/%
+	@echo Testing $*
+	$(QUIET)$(TIMELIMIT)$(ROOT)/$* $(RUN_ARGS)
+	@touch $@
+
+# Compile a test executable from the D source file of the same name.
+$(ROOT)/%: $(SRC)/%.d
+	$(QUIET)$(DMD) $(DFLAGS) -of$@ $<
+
+# Remove the whole output directory.
+clean:
+	rm -rf $(ROOT)