CRT:memsetの実装についての考察


Step1: シンプルに実装(1byteずつ代入)

1
2
3
4
5
6
7
8
9
/*
 * Step 1: straightforward byte-by-byte fill.
 *
 * Stores the low 8 bits of c into each of the first n bytes starting at p
 * and returns p unchanged.
 */
void* memset(void* p, int c, size_t n)
{
    unsigned char* const dst = (unsigned char*)p;
    const unsigned char fill = (unsigned char)(c & 0xFF);
    size_t i;

    /* One store per byte; nothing is written when n is 0. */
    for (i = 0; i < n; ++i) {
        dst[i] = fill;
    }
    return p;
}

Step2: Windows(32bit版)用に変更

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Step 2: 32-bit Windows variant.
 *
 * Writes the bulk of the range as 32-bit words (4 bytes per store) and
 * finishes the remaining 0-3 bytes one at a time. The destination is used
 * as given: no alignment adjustment is made before the word stores.
 * Returns p unchanged.
 */
void* memset(void* p, int c, size_t n)
{
    const unsigned char fill = (unsigned char)(c & 0xFF);
    unsigned char* tail = (unsigned char*)p;
    size_t words = n / 4U;

    if (words != 0U) {
        unsigned __int32* dst = (unsigned __int32*)tail;
        /* Replicate the fill byte into all four byte lanes of the word. */
        const unsigned __int32 pattern = (unsigned __int32)fill * 0x01010101U;

        n -= words * 4U; /* bytes left over for the byte loop below */
        do {
            *dst++ = pattern; /* 4 bytes per store */
        } while (--words != 0U);
        tail = (unsigned char*)dst;
    }

    /* Trailing bytes (or the whole range when n < 4), one byte per store. */
    for (; n != 0U; --n) {
        *tail++ = fill;
    }
    return p;
}

Step3: Windows(32bit版、64bit版)用に変更(ワード境界へのアライメントも加味)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * Step 3: word-at-a-time fill for both 32-bit and 64-bit builds, with the
 * destination brought onto a word boundary first.
 *
 * Stores the low 8 bits of c into each of the first n bytes starting at p
 * and returns p unchanged.
 *
 * The machine word is taken to be size_t. The address is inspected through
 * a pointer-to-size_t conversion, which is assumed to preserve the low
 * address bits (true on flat-address targets such as Win32/Win64).
 */
void* memset(void* p, int c, size_t n)
{
    const unsigned char ch8 = (unsigned char)(c & 0xFF);
    unsigned char* ptr8 = (unsigned char*)p;
    const size_t word_mask = sizeof(size_t) - 1;

    /*
     * Number of leading bytes to write so that ptr8 lands on a word
     * boundary. This is (word size - misalignment), not the misalignment
     * itself, and is 0 when the pointer is already aligned.
     */
    size_t head = (sizeof(size_t) - ((size_t)ptr8 & word_mask)) & word_mask;

    /* Use word stores only if at least one whole word follows the head. */
    if (n >= head + sizeof(size_t)) {
        n -= head;
        while (head--) {
            *ptr8++ = ch8;
        }

        size_t* ptr_word = (size_t*)ptr8; /* aligned from here on */
        /*
         * (size_t)-1 / 0xFF is 0x0101...01 for any word width, so the
         * product copies ch8 into every byte lane without needing a
         * per-platform shift sequence.
         */
        const size_t ch_word = (size_t)ch8 * ((size_t)-1 / 0xFF);

        while (n >= sizeof(size_t)) {
            *ptr_word++ = ch_word;
            n -= sizeof(size_t);
        }
        ptr8 = (unsigned char*)ptr_word;
    }

    /* Trailing bytes, or the whole range when it was too short for words. */
    while (n--) {
        *ptr8++ = ch8;
    }
    return p;
}

アセンブル結果(Step3のコードを32bit向けにビルドした際の逆アセンブル)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
void* memset(void* buffer, int c, size_t n)
{
00CC10B0  push        ebp
00CC10B1  mov         ebp,esp
00CC10B3  push        ecx
    const unsigned char ch8 = (unsigned char)(c & 0xFF);
    unsigned char* ptr8 = (unsigned char*)buffer;
    unsigned align = (((unsigned)ptr8) & (sizeof(void*) - 1));
    if (n >= (align + sizeof(void*))) {
00CC10B4  mov         edx,dword ptr [n]
00CC10B7  push        ebx
00CC10B8  push        esi
00CC10B9  mov         esi,dword ptr [buffer]
00CC10BC  mov         ebx,esi
00CC10BE  mov         ecx,esi
00CC10C0  and         ecx,3
00CC10C3  push        edi
00CC10C4  lea         eax,[ecx+4]
00CC10C7  cmp         edx,eax
00CC10C9  jb          memset+7Ah (0CC112Ah)
        // word boundary
        while (align--) {
00CC10CB  test        ecx,ecx
00CC10CD  je          memset+44h (0CC10F4h)
00CC10CF  movzx       eax,byte ptr [c]
00CC10D3  mov         edi,esi
00CC10D5  mov         esi,ecx
00CC10D7  imul        eax,eax,1010101h
00CC10DD  shr         ecx,2
00CC10E0  rep stos    dword ptr es:[edi]
00CC10E2  mov         ecx,esi
00CC10E4  and         ecx,3
00CC10E7  rep stos    byte ptr es:[edi]
00CC10E9  mov         eax,esi
00CC10EB  add         ebx,eax
00CC10ED  lea         ecx,[ecx]
            *ptr8++ = ch8;
            --n;
00CC10F0  dec         edx
00CC10F1  dec         eax
00CC10F2  jne         memset+40h (0CC10F0h)
        }
        unsigned __int3264* ptr3264 = (unsigned __int3264*)ptr8;
        unsigned __int3264 ch3264 = ch8;
00CC10F4  mov         eax,dword ptr [c]
00CC10F7  movzx       edi,al
        if (ch3264) {
00CC10FA  test        al,al
00CC10FC  je          memset+5Ch (0CC110Ch)
            ch3264 |= ch3264 << 8;
00CC10FE  mov         eax,edi
00CC1100  shl         eax,8
            ch3264 |= ch3264 << 8;
00CC1103  or          edi,eax
            ch3264 |= ch3264 << 16;
00CC1105  mov         eax,edi
00CC1107  shl         eax,10h
00CC110A  or          edi,eax
#if defined(_WIN64)
            ch3264 |= ch3264 << 32;
#endif
        }
        while (n >= sizeof(void*)) {
00CC110C  cmp         edx,4
00CC110F  jb          memset+77h (0CC1127h)
00CC1111  mov         esi,edx
00CC1113  mov         eax,edi
00CC1115  shr         esi,2
00CC1118  mov         edi,ebx
00CC111A  mov         ecx,esi
00CC111C  rep stos    dword ptr es:[edi]
00CC111E  lea         ebx,[ebx+esi*4]
          *ptr3264++ = ch3264;
          n -= sizeof(void*);
00CC1121  sub         edx,4
00CC1124  dec         esi
00CC1125  jne         memset+71h (0CC1121h)
00CC1127  mov         esi,dword ptr [buffer]
        }
        ptr8 = (unsigned char*)ptr3264;
    }
    while (n--) {
00CC112A  test        edx,edx
00CC112C  je          memset+0A4h (0CC1154h)
00CC112E  movzx       eax,byte ptr [c]
00CC1132  mov         ecx,edx
00CC1134  mov         esi,ecx
00CC1136  imul        eax,eax,1010101h
00CC113C  shr         ecx,2
00CC113F  mov         edi,ebx
00CC1141  rep stos    dword ptr es:[edi]
00CC1143  mov         ecx,esi
00CC1145  and         ecx,3
00CC1148  rep stos    byte ptr es:[edi]
        *ptr8++ = ch8;
    }
    return buffer;
00CC114A  mov         eax,dword ptr [buffer]
00CC114D  pop         edi
00CC114E  pop         esi
00CC114F  pop         ebx
}
00CC1150  mov         esp,ebp
00CC1152  pop         ebp
00CC1153  ret
00CC1154  pop         edi
        *ptr8++ = ch8;
    }
    return buffer;
00CC1155  mov         eax,esi
00CC1157  pop         esi
00CC1158  pop         ebx
}
00CC1159  mov         esp,ebp
00CC115B  pop         ebp
00CC115C  ret