Skip to content
This repository was archived by the owner on Oct 12, 2022. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
847140e
memutils: Replacement of libc string.h functions - currently only Dme…
baziotis Jul 4, 2019
f991173
Style fix
baziotis Jul 4, 2019
ea2ce59
Versioning fix
baziotis Jul 4, 2019
bac120f
Versioning fix vol. 2
baziotis Jul 4, 2019
ff7e755
Independency of std.traits
baziotis Jul 4, 2019
497e53f
Minor fixes/changes
baziotis Jul 5, 2019
c52c099
Style and layout changes
baziotis Jul 5, 2019
6ebec4b
Moved tests to test folder
baziotis Jul 5, 2019
57552ed
More naming and style changes
baziotis Jul 5, 2019
60b3967
Minor fix
baziotis Jul 5, 2019
4faa8f8
Versioning improvement
baziotis Jul 5, 2019
a161b98
Move std.traits code to core.internal.traits
baziotis Jul 5, 2019
5da39a9
Naming fix
baziotis Jul 5, 2019
cc6d019
Fix in using non-existent code in internal.traits unittests
baziotis Jul 5, 2019
7b9eb3c
Fix for uint vs ubyte in memsetNaive
baziotis Jul 5, 2019
08d044f
Removed escaping from tests in memutils
baziotis Jul 6, 2019
d611a18
Versioning improvement
baziotis Jul 7, 2019
00ca80a
GDC SIMD version and bug fix
baziotis Jul 7, 2019
9ad8f16
Not so naive version of memsetNaive
baziotis Jul 7, 2019
504fc7b
mixin removal in memsetNaive
baziotis Jul 29, 2019
9af240f
Style fix
baziotis Jul 29, 2019
08ffa2c
Doc fix
baziotis Jul 30, 2019
ff81219
SIMD versioning improvement
baziotis Jul 30, 2019
4e6654b
Doc improvement
baziotis Jul 30, 2019
d7b8a0b
Doc improvement 2
baziotis Jul 30, 2019
83541f7
Minor changes
baziotis Jul 30, 2019
b9bc30c
Changed Returns to N.B.
baziotis Jul 30, 2019
1204a8b
Add test for empty array
baziotis Aug 3, 2019
a4c7a8d
Added @nogc nothrow
baziotis Aug 3, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mak/COPY
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ COPY=\
$(IMPDIR)\core\time.d \
$(IMPDIR)\core\vararg.d \
\
$(IMPDIR)\core\experimental\memutils.d \
\
$(IMPDIR)\core\internal\abort.d \
$(IMPDIR)\core\internal\arrayop.d \
$(IMPDIR)\core\internal\convert.d \
Expand Down
2 changes: 2 additions & 0 deletions mak/DOCS
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ DOCS=\
$(DOCDIR)\core_gc_config.html \
$(DOCDIR)\core_gc_gcinterface.html \
$(DOCDIR)\core_gc_registry.html \
\
$(DOCDIR)\core_experimental_memutils.html \
\
$(DOCDIR)\core_stdc_assert_.html \
$(DOCDIR)\core_stdc_config.html \
Expand Down
2 changes: 2 additions & 0 deletions mak/SRCS
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ SRCS=\
src\core\thread.d \
src\core\time.d \
src\core\vararg.d \
\
src\core\experimental\memutils.d \
\
src\core\gc\config.d \
src\core\gc\gcinterface.d \
Expand Down
3 changes: 3 additions & 0 deletions mak/WINDOWS
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d
$(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d
copy $** $@

$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d
copy $** $@

$(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
copy $** $@

Expand Down
256 changes: 256 additions & 0 deletions src/core/experimental/memutils.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
/**
* Pure D replacement of the C Standard Library basic memory building blocks of string.h
* Source: $(DRUNTIMESRC core/experimental/memutils.d)
*/
module core.experimental.memutils;

/**
* If T is an array, set all `dst`'s bytes
* (whose count is the length of the array times
* the size of the array element) to `val`.
* Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
*
* Params
* val = The byte with which we want to fill memory with.
* dst = Memory Destination whose bytes are to be set to `val`.
*
* Returns:
* Nothing.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can just leave the Returns section out if it's void. 😉

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haha, I didn't like it too. But I wanted a way to make a note that this memset is different from the libc in that it returns nothing. I'll add it as a N.B..

*/
void memset(T)(ref T dst, const ubyte val)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this undocumented (ddoc) on purpose?

Copy link
Copy Markdown
Contributor Author

@baziotis baziotis Jul 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean it should be:

/**
 * ...
 */

?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and a Params section (https://dlang.org/spec/ddoc.html#standard_sections) is probably best too.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, thanks. I'm not very accustomed yet to the logistics of contributing to D.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably you can add scope to the reference? Explicit nogc nothrow may be good too.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'm not very familiar with scope but nogc, nothrow is good.

{
Comment thread
baziotis marked this conversation as resolved.
import core.internal.traits : isArray;
const uint v = cast(uint) val;
static if (isArray!T)
{
size_t n = dst.length * typeof(dst[0]).sizeof;
Dmemset(dst.ptr, v, n);
}
else
{
Dmemset(&dst, v, T.sizeof);
}
}

version (D_SIMD)
{
import core.simd : float4;
enum useSIMD = true;
}
else version (LDC)
{
// LDC always supports SIMD (but doesn't ever set D_SIMD) and
// the back-end uses the most appropriate size for every target.
import core.simd : float4;
enum useSIMD = true;
}
else version (GNU)
{
import core.simd : float4;
// GNU does not support SIMD by default.
version (X86_64)
{
private enum isX86 = true;
}
else version (X86)
{
private enum isX86 = true;
}

static if (isX86 && __traits(compiles, int4))
{
enum useSIMD = true;
}
else
{
enum useSIMD = false;
}
}

version (useSIMD)
{
/* SIMD implementation
*/
private void Dmemset(void *d, const uint val, size_t n)
{
import core.simd : int4;
version (LDC)
{
import ldc.simd : loadUnaligned, storeUnaligned;
void store16i_sse(void *dest, int4 reg)
{
storeUnaligned!int4(reg, cast(int*) dest);
}
}
else version (DigitalMars)
{
import core.simd : void16, loadUnaligned, storeUnaligned;
void store16i_sse(void *dest, int4 reg)
{
storeUnaligned(cast(void16*) dest, reg);
}
}
else
{
import gcc.builtins;
import core.simd : ubyte16;
void store16i_sse(void *dest, int4 reg)
{
__builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg);
}
}
void store32i_sse(void *dest, int4 reg)
{
store16i_sse(dest, reg);
store16i_sse(dest+0x10, reg);
}
// NOTE(stefanos): I use the naive version, which in my benchmarks was slower
// than the previous classic switch. BUT. Using the switch had a significant
// drop in the rest of the sizes. It's not the branch that is responsible for the drop,
// but the fact that it's more difficult to optimize it as part of the rest of the code.
if (n < 32)
{
memsetNaive(d, val, n);
return;
}
void *temp = d + n - 0x10; // Used for the last 32 bytes
const uint v = val * 0x01010101; // Broadcast c to all 4 bytes
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mul is almost never a good idea. I suspect the mul may be a bottleneck in the ~32-128 byte range. If you're gonna do it, why not extend to 8 bytes and get more value from it?

Do you have a reference for this code? It doesn't look very optimal to me. If you're gonna use SSE, there are broadcast and permute functions, which not introduce hazards as bad as mul...

Copy link
Copy Markdown
Contributor Author

@baziotis baziotis Aug 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's inspired by Agner Fog. I didn't copy code from him but I have read his optimization manual. This mul trick is his. And no, a mul in x86 is not a problem nowadays.

Our code turned out to be similar e.g. his AVX version: https://github.com/tpn/agner/blob/master/asmlib/asmlibSrc/memset64.asm#L188

It's also inspired by GCC.

Edit: Oh, and there's no point in doing in 8 bytes. It is immediately broadcasted in an XMM register.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm hyper-aware of this mul trick, I've been doing this for decades, but it absolutely IS a problem 'nowdays'... I've written a lot of code using mul-tricks like this, and been surprised when they become an unintuitive bottleneck.
It might not be a problem here, because I think it's usually the case that memset is not called in a hot-loop, but multiplies usually have a lot of limitations on the pipeline.
There's usually only one imul pipeline, and while the latency might not be so bad these days, the circuit is kinda recursive, so it can't accept a new instruction per cycle like add/sub circuits, so if there are multiple mul's contending for the limited imul resource, they create stalling hazards on each other.
imul is often used in address calculation, so there's a reasonable possibility of contention, but there's an if above this, so the pipeline might already have flushed...

I do a lot of work with colour precision scaling using this same trick; the imul is almost always the limiting factor in my loops.

Anyway, I only say this because you're shoving the value straight into SSE in the following line, where you can use much faster permute operations to broadcast the value very easily (like pshufb, or complements on other architectures).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't know all that stuff about muls limiting the pipeline. I just knew that the cycles they consume are about the same as say add / sub. Is there somewhere I can read more about those?

But the probabilities of Agner not having thought this is close to 0. :P

// Broadcast v to all bytes.
auto xmm0 = int4(v);
ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary.
// Store 16 bytes, from which some will possibly overlap on a future store.
// For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
// add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
// 16, we store 16 bytes anyway.
store16i_sse(d, xmm0);
d += 16 - rem;
n -= 16 - rem;
// Move in blocks of 32.
if (n >= 32)
{
// Align to (previous) multiple of 32. That does something invisible to the code,
// but a good optimizer will avoid a `cmp` instruction inside the loop. With a
// multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
// sub RDX, 32;
// jge START_OF_THE_LOOP.
// Without that, it has to be:
// sub RDX, 32;
// cmp RDX, 32;
// jge START_OF_THE_LOOP
// NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means
// we have somehow to compensate for that, which is done at the end of this function.
n &= -32;
do
{
store32i_sse(d, xmm0);
// NOTE(stefanos): I tried avoiding this operation on `d` by combining
// `d` and `n` in the above loop and going backwards. It was slower in my benchs.
d += 32;
n -= 32;
} while (n >= 32);
}
// Compensate for the last (at most) 32 bytes.
store32i_sse(temp-0x10, xmm0);
}
}
else
{
/* Forward to simple implementation.
*/
private void Dmemset(void *d, const uint val, size_t n)
{
memsetNaive(d, val, n);
}
}

/* Naive version for when there isn't any vector support (SIMD etc.).
*/
private void memsetNaive(void *dst, const uint val, size_t n)
{
// NOTE(stefanos): DMD could not inline it.
void handleLT16Sizes(void *d, const ulong v, size_t n)
{
switch (n)
{
case 6:
*(cast(uint*) (d+2)) = cast(uint) v;
goto case 2; // fall-through
case 2:
*(cast(ushort*) d) = cast(ushort) v;
return;

case 7:
*(cast(uint*) (d+3)) = cast(uint) v;
goto case 3; // fall-through
case 3:
*(cast(ushort*) (d+1)) = cast(ushort) v;
goto case 1; // fall-through
case 1:
*(cast(ubyte*) d) = cast(ubyte) v;
return;

case 4:
*(cast(uint*) d) = cast(uint) v;
return;
case 0:
return;

case 5:
*(cast(uint*) (d+1)) = cast(uint) v;
*(cast(ubyte*) d) = cast(ubyte) v;
return;
default:
}
}


const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast c to all 8 bytes
if (n < 8)
{
handleLT16Sizes(dst, v, n);
return;
}
// NOTE(stefanos): Normally, we would have different alignment
// for 32-bit and 64-bit versions. For the sake of simplicity,
// we'll let the compiler do the work.
ubyte rem = cast(ubyte) dst & 7;
if (rem)
{ // Unaligned
// Move 8 bytes (which we will possibly overlap later).
*(cast(ulong*) dst) = v;
dst += 8 - rem;
n -= 8 - rem;
}
ulong *d = cast(ulong*) dst;
ulong temp = n / 8;
// Go in steps of 8 - the register size in x86_64.
for (size_t i = 0; i != temp; ++i)
{
*d = v;
++d;
n -= 8;
}
dst = cast(void *) d;

handleLT16Sizes(dst, v, n);
}


/** Core features tests.
*/
unittest
{
ubyte[3] a;
memset(a, 7);
assert(a[0] == 7);
assert(a[1] == 7);
assert(a[2] == 7);

real b;
memset(b, 9);
ubyte *p = cast(ubyte*) &b;
foreach (i; 0 .. b.sizeof)
{
assert(p[i] == 9);
}
}
Loading