-
-
Notifications
You must be signed in to change notification settings - Fork 410
memutils: Replacement of libc string.h functions - currently only Dmemset() #2662
Changes from 16 commits
847140e
f991173
ea2ce59
bac120f
ff7e755
497e53f
c52c099
6ebec4b
57552ed
60b3967
4faa8f8
a161b98
5da39a9
cc6d019
7b9eb3c
08d044f
d611a18
00ca80a
9ad8f16
504fc7b
9af240f
08ffa2c
ff81219
4e6654b
d7b8a0b
83541f7
b9bc30c
1204a8b
a4c7a8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,179 @@ | ||
| /** | ||
| * Pure D replacement of the C Standard Library basic memory building blocks of string.h | ||
| * Source: $(DRUNTIMESRC core/experimental/memutils.d) | ||
| */ | ||
| module core.experimental.memutils; | ||
|
|
||
| /** memset() implementation */ | ||
|
|
||
| /** | ||
| * NOTE(stefanos): | ||
|
baziotis marked this conversation as resolved.
Outdated
|
||
| * Range-checking is not needed since the user never | ||
| * passes an `n` (byte count) directly. | ||
| */ | ||
|
|
||
| /* | ||
| If T is an array, set all `dst`'s bytes | ||
| (whose count is the length of the array times | ||
| the size of the array element) to `val`. | ||
| Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. | ||
| */ | ||
|
|
||
| void memset(T)(ref T dst, const ubyte val) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this undocumented (ddoc) on purpose?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You mean it should be: ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, and a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, thanks. I'm not very accustomed yet to the logistics of contributing to D.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably you can add
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I'm not very familiar with |
||
| { | ||
|
baziotis marked this conversation as resolved.
|
||
| import core.internal.traits : isArray; | ||
| // Dmemset() takes the fill value as a uint, so widen the byte once here. | ||
| const uint v = cast(uint) val; | ||
| static if (isArray!T) | ||
| { | ||
| // Array case: fill the elements reached through `dst.ptr`, not the storage of `dst` itself. | ||
| // Byte count = number of elements times the size of one element. | ||
| size_t n = dst.length * typeof(dst[0]).sizeof; | ||
| Dmemset(dst.ptr, v, n); | ||
| } | ||
| else | ||
| { | ||
| // Non-array case: fill the T.sizeof bytes that make up `dst`. | ||
| Dmemset(&dst, v, T.sizeof); | ||
| } | ||
| } | ||
|
|
||
| version (GNU) | ||
|
baziotis marked this conversation as resolved.
Outdated
|
||
| { | ||
| // version (GNU): no vector path is selected here; every request is forwarded to memsetNaive(). | ||
| private void Dmemset(void *d, const uint val, size_t n) | ||
| { | ||
| memsetNaive(d, val, n); | ||
| } | ||
| } | ||
| else | ||
| version (D_SIMD) | ||
| { | ||
| /* SIMD implementation | ||
| */ | ||
| /* | ||
| Fill `n` bytes starting at `d` with the byte held in `val`, using unaligned 16-byte stores. | ||
| Params: | ||
| d = start of the destination region | ||
| val = fill byte widened to uint; the broadcast multiply below relies on val < 256 | ||
| n = number of bytes to write | ||
| Regions shorter than 32 bytes are handled by memsetNaive(): the vector path always | ||
| finishes with one 32-byte store that ends exactly at `d + n`, and that store would | ||
| start before `d` (an out-of-bounds write) for any smaller `n`. | ||
| */ | ||
| private void Dmemset(void *d, const uint val, size_t n) | ||
| { | ||
| import core.simd : int4; | ||
| version (LDC) | ||
| { | ||
| import ldc.simd : loadUnaligned, storeUnaligned; | ||
| } | ||
| else version (DigitalMars) | ||
| { | ||
| import core.simd : void16, loadUnaligned, storeUnaligned; | ||
| } | ||
| else | ||
| { | ||
| static assert(0, "Only DMD / LDC are supported"); | ||
| } | ||
| // TODO(stefanos): Is there a way to make them @safe? | ||
| // (The problem is that for LDC, they could take int* or float* pointers | ||
| // but the cast to void16 for DMD is necessary anyway). | ||
| // Store `reg` twice, covering the 32 bytes [dest, dest + 32). | ||
| void store32i_sse(void *dest, int4 reg) | ||
| { | ||
| version (LDC) | ||
| { | ||
| storeUnaligned!int4(reg, cast(int*) dest); | ||
| storeUnaligned!int4(reg, cast(int*) (dest+0x10)); | ||
| } | ||
| else | ||
| { | ||
| storeUnaligned(cast(void16*) dest, reg); | ||
| storeUnaligned(cast(void16*) (dest+0x10), reg); | ||
| } | ||
| } | ||
| // Store `reg` once, covering the 16 bytes [dest, dest + 16). | ||
| void store16i_sse(void *dest, int4 reg) | ||
| { | ||
| version (LDC) | ||
| { | ||
| storeUnaligned!int4(reg, cast(int*) dest); | ||
| } | ||
| else | ||
| { | ||
| storeUnaligned(cast(void16*) dest, reg); | ||
| } | ||
| } | ||
| const uint v = val * 0x01010101; // Broadcast `val` to all 4 bytes | ||
| // NOTE(stefanos): I use the naive version, which in my benchmarks was slower | ||
| // than the previous classic switch. BUT. Using the switch had a significant | ||
| // drop in the rest of the sizes. It's not the branch that is responsible for the drop, | ||
| // but the fact that it's more difficult to optimize it as part of the rest of the code. | ||
| // The threshold must be 32, not 16: the closing store32i_sse() below writes | ||
| // [d + n - 32, d + n), which lies inside the buffer only when n >= 32. | ||
| if (n < 32) | ||
| { | ||
| memsetNaive(cast(ubyte*) d, cast(ubyte) val, n); | ||
| return; | ||
| } | ||
| void *temp = d + n - 0x10; // Start of the last 16 bytes; `temp - 0x10` starts the last 32 | ||
| // Broadcast v to all bytes. | ||
| auto xmm0 = int4(v); | ||
| ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. | ||
| // Store 16 bytes, from which some will possibly overlap on a future store. | ||
| // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, | ||
| // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most | ||
| // 16, we store 16 bytes anyway. | ||
| store16i_sse(d, xmm0); | ||
| d += 16 - rem; | ||
| n -= 16 - rem; | ||
| // Move in blocks of 32. | ||
| // TODO(stefanos): Experiment with different sizes. | ||
| if (n >= 32) | ||
| { | ||
| // Align to (previous) multiple of 32. That does something invisible to the code, | ||
| // but a good optimizer will avoid a `cmp` instruction inside the loop. With a | ||
| // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): | ||
| // sub RDX, 32; | ||
| // jge START_OF_THE_LOOP. | ||
| // Without that, it has to be: | ||
| // sub RDX, 32; | ||
| // cmp RDX, 32; | ||
| // jge START_OF_THE_LOOP | ||
| // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means | ||
| // we have somehow to compensate for that, which is done at the end of this function. | ||
| n &= -32; | ||
| do | ||
| { | ||
| store32i_sse(d, xmm0); | ||
| // NOTE(stefanos): I tried avoiding this operation on `d` by combining | ||
| // `d` and `n` in the above loop and going backwards. It was slower in my benchs. | ||
| d += 32; | ||
| n -= 32; | ||
| } while (n >= 32); | ||
| } | ||
| // Compensate for the last (at most) 32 bytes: fewer than 32 bytes are still unwritten | ||
| // at this point, so one store over the final 32 bytes of the region covers them. | ||
| store32i_sse(temp-0x10, xmm0); | ||
| } | ||
|
|
||
| } | ||
| else | ||
| { | ||
| // Neither version (GNU) nor version (D_SIMD): forward every request to memsetNaive(). | ||
| private void Dmemset(void *d, const uint val, size_t n) | ||
| { | ||
| memsetNaive(d, val, n); | ||
| } | ||
| } | ||
|
|
||
| /* Naive implementation | ||
| */ | ||
| /* | ||
| Byte-at-a-time fill: writes the low 8 bits of `val` into each of the `n` bytes | ||
| starting at `dst`. Does nothing when `n` is 0. | ||
| */ | ||
| private void memsetNaive(void *dst, const uint val, size_t n) | ||
| { | ||
| // Narrow the fill value once instead of on every iteration. | ||
| const ubyte fill = cast(ubyte) val; | ||
| ubyte *cursor = cast(ubyte*) dst; | ||
| for (size_t i = 0; i < n; ++i) | ||
| { | ||
| cursor[i] = fill; | ||
| } | ||
| } | ||
|
|
||
|
|
||
| /** Core features tests. | ||
| */ | ||
| unittest | ||
| { | ||
| // Static array: every element must take the fill value. | ||
| ubyte[3] arr; | ||
| memset(arr, 7); | ||
| foreach (const elem; arr) | ||
| { | ||
| assert(elem == 7); | ||
| } | ||
|
|
||
| // Non-array value: every byte of its storage must take the fill value. | ||
| real r; | ||
| memset(r, 9); | ||
| ubyte *bytes = cast(ubyte*) &r; | ||
| for (size_t i = 0; i < r.sizeof; ++i) | ||
| { | ||
| assert(bytes[i] == 9); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| # Test driver for the memutils test program. | ||
| # NOTE(review): ROOT, SRC, DMD, DFLAGS, QUIET, TIMELIMIT and RUN_ARGS are not defined here; | ||
| # presumably they come from ../common.mak - confirm. | ||
| include ../common.mak | ||
|
|
||
| # Names of the test programs; each one is built from $(SRC)/<name>.d. | ||
| TESTS:=memutils | ||
|
|
||
| .PHONY: all clean | ||
| # `all` requires one $(ROOT)/<name>.done stamp file per entry in TESTS. | ||
| all: $(addprefix $(ROOT)/,$(addsuffix .done,$(TESTS))) | ||
|
|
||
| # Run a built test binary and create its .done stamp file afterwards. | ||
| $(ROOT)/%.done: $(ROOT)/% | ||
| @echo Testing $* | ||
| $(QUIET)$(TIMELIMIT)$(ROOT)/$* $(RUN_ARGS) | ||
| @touch $@ | ||
|
|
||
| # Compile a test binary from its D source file. | ||
| $(ROOT)/%: $(SRC)/%.d | ||
| $(QUIET)$(DMD) $(DFLAGS) -of$@ $< | ||
|
|
||
| # Remove the whole $(ROOT) directory. | ||
| clean: | ||
| rm -rf $(ROOT) |
Uh oh!
There was an error while loading. Please reload this page.