#if defined(__x86_64__) || defined(__amd64__)
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    static const uint32_t K256 alignas(16) [] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    // pshufb controls: FLIP_MASK byte-swaps each 32-bit word; SHUF_00BA and
    // SHUF_DC00 repack the two halves of the schedule computation.
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
    uint64_t tbl; // holds the address of K256 while the asm runs (operand %13 below)
    uint64_t inp_end, inp;
    uint32_t xfer alignas(16) [4];

    __asm__ __volatile__(
        /* Only the vector message-schedule skeleton is reproduced below; the
           scalar instructions for the individual SHA-256 rounds are
           interleaved between these ops in the full listing. */
        "movdqu (%1),%%xmm4;"
        "pshufb %%xmm12,%%xmm4;"
        "movdqu 0x10(%1),%%xmm5;"
        "pshufb %%xmm12,%%xmm5;"
        "movdqu 0x20(%1),%%xmm6;"
        "pshufb %%xmm12,%%xmm6;"
        "movdqu 0x30(%1),%%xmm7;"
        "pshufb %%xmm12,%%xmm7;"
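        /* xmm4..xmm7 now hold the sixteen message words W[0..15], byte-swapped
           to host order (xmm12 carries the FLIP_MASK pshufb control). */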
        /* Schedule group 1: stage W+K through xmm9 (constants at 0x0(%13));
           the four new schedule words are produced into xmm4. */
        "movdqa 0x0(%13),%%xmm9;"
        "paddd  %%xmm4,%%xmm9;"
        "movdqa %%xmm7,%%xmm0;"
        "palignr $0x4,%%xmm6,%%xmm0;"
        "movdqa %%xmm5,%%xmm1;"
        "paddd  %%xmm4,%%xmm0;"
        "palignr $0x4,%%xmm4,%%xmm1;"
        "movdqa %%xmm1,%%xmm2;"
        "movdqa %%xmm1,%%xmm3;"
        "pslld  $0x19,%%xmm1;"
        "movdqa %%xmm3,%%xmm2;"
        "movdqa %%xmm3,%%xmm8;"
        "psrld  $0x12,%%xmm2;"
        "pxor   %%xmm3,%%xmm1;"
        "pxor   %%xmm2,%%xmm1;"
        "pxor   %%xmm8,%%xmm1;"
        "pshufd $0xfa,%%xmm7,%%xmm2;"
        "paddd  %%xmm1,%%xmm0;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm8;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm8;"
        "pshufb %%xmm10,%%xmm8;"
        "paddd  %%xmm8,%%xmm0;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm4;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm4;"
        "pshufb %%xmm11,%%xmm4;"
        "paddd  %%xmm0,%%xmm4;"
        /* Schedule group 2: constants at 0x10(%13); new words into xmm5. */
        "movdqa 0x10(%13),%%xmm9;"
        "paddd  %%xmm5,%%xmm9;"
        "movdqa %%xmm4,%%xmm0;"
        "palignr $0x4,%%xmm7,%%xmm0;"
        "movdqa %%xmm6,%%xmm1;"
        "paddd  %%xmm5,%%xmm0;"
        "palignr $0x4,%%xmm5,%%xmm1;"
        "movdqa %%xmm1,%%xmm2;"
        "movdqa %%xmm1,%%xmm3;"
        "pslld  $0x19,%%xmm1;"
        "movdqa %%xmm3,%%xmm2;"
        "movdqa %%xmm3,%%xmm8;"
        "psrld  $0x12,%%xmm2;"
        "pxor   %%xmm3,%%xmm1;"
        "pxor   %%xmm2,%%xmm1;"
        "pxor   %%xmm8,%%xmm1;"
        "pshufd $0xfa,%%xmm4,%%xmm2;"
        "paddd  %%xmm1,%%xmm0;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm8;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm8;"
        "pshufb %%xmm10,%%xmm8;"
        "paddd  %%xmm8,%%xmm0;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm5;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm5;"
        "pshufb %%xmm11,%%xmm5;"
        "paddd  %%xmm0,%%xmm5;"
        /* Schedule group 3: constants at 0x20(%13); new words into xmm6. */
        "movdqa 0x20(%13),%%xmm9;"
        "paddd  %%xmm6,%%xmm9;"
        "movdqa %%xmm5,%%xmm0;"
        "palignr $0x4,%%xmm4,%%xmm0;"
        "movdqa %%xmm7,%%xmm1;"
        "paddd  %%xmm6,%%xmm0;"
        "palignr $0x4,%%xmm6,%%xmm1;"
        "movdqa %%xmm1,%%xmm2;"
        "movdqa %%xmm1,%%xmm3;"
        "pslld  $0x19,%%xmm1;"
        "movdqa %%xmm3,%%xmm2;"
        "movdqa %%xmm3,%%xmm8;"
        "psrld  $0x12,%%xmm2;"
        "pxor   %%xmm3,%%xmm1;"
        "pxor   %%xmm2,%%xmm1;"
        "pxor   %%xmm8,%%xmm1;"
        "pshufd $0xfa,%%xmm5,%%xmm2;"
        "paddd  %%xmm1,%%xmm0;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm8;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm8;"
        "pshufb %%xmm10,%%xmm8;"
        "paddd  %%xmm8,%%xmm0;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm6;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm6;"
        "pshufb %%xmm11,%%xmm6;"
        "paddd  %%xmm0,%%xmm6;"
        /* Schedule group 4: constants at 0x30(%13); new words into xmm7. */
        "movdqa 0x30(%13),%%xmm9;"
        "paddd  %%xmm7,%%xmm9;"
        "movdqa %%xmm6,%%xmm0;"
        "palignr $0x4,%%xmm5,%%xmm0;"
        "movdqa %%xmm4,%%xmm1;"
        "paddd  %%xmm7,%%xmm0;"
        "palignr $0x4,%%xmm7,%%xmm1;"
        "movdqa %%xmm1,%%xmm2;"
        "movdqa %%xmm1,%%xmm3;"
        "pslld  $0x19,%%xmm1;"
        "movdqa %%xmm3,%%xmm2;"
        "movdqa %%xmm3,%%xmm8;"
        "psrld  $0x12,%%xmm2;"
        "pxor   %%xmm3,%%xmm1;"
        "pxor   %%xmm2,%%xmm1;"
        "pxor   %%xmm8,%%xmm1;"
        "pshufd $0xfa,%%xmm6,%%xmm2;"
        "paddd  %%xmm1,%%xmm0;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm8;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm8;"
        "pshufb %%xmm10,%%xmm8;"
        "paddd  %%xmm8,%%xmm0;"
        "pshufd $0x50,%%xmm0,%%xmm2;"
        "movdqa %%xmm2,%%xmm3;"
        "movdqa %%xmm2,%%xmm7;"
        "psrlq  $0x11,%%xmm2;"
        "psrlq  $0x13,%%xmm3;"
        "pxor   %%xmm3,%%xmm2;"
        "pxor   %%xmm2,%%xmm7;"
        "pshufb %%xmm11,%%xmm7;"
        "paddd  %%xmm0,%%xmm7;"
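        /* Each group above implements the standard schedule recurrence
           W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
           four words at a time, with sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
           and sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10); the psrlq
           $0x11/$0x13 pairs are the sigma1 rotates, and SHUF_00BA/SHUF_DC00
           (xmm10/xmm11) repack the two half-results. */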
        /* Final rounds: the schedule is complete, so only the round constants
           need to be added to the remaining message words. */
        "paddd  0x0(%13),%%xmm4;"
        "paddd  0x10(%13),%%xmm5;"
        /* The second pass over the tail reuses the words in xmm6/xmm7. */
        "movdqa %%xmm6,%%xmm4;"
        "movdqa %%xmm7,%%xmm5;"
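        /* Operand map, per the constraint lists below: %0 = s, %1 = chunk,
           %2 = blocks, %3..%9 = the working variables a..d and f..h,
           %10..%12 = y0..y2, %13 = tbl (K256 pointer), %14 = inp_end,
           %15 = inp, %16 = xfer; inputs %17..%20 = K256, FLIP_MASK,
           SHUF_00BA, SHUF_DC00. */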
        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
    );
}

#endif
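
// A minimal self-check sketch, assuming Transform is visible at this point in
// the translation unit (in the full file it may live inside a namespace) and
// that SHA256_SSE4_DEMO is a hypothetical macro used only to keep this harness
// out of normal builds. The state below is the standard SHA-256 IV and the
// block is the padded empty message, so the printed digest should be
// e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855.
#if defined(SHA256_SSE4_DEMO) && (defined(__x86_64__) || defined(__amd64__))
#include <cstdint>
#include <cstdio>

int main()
{
    uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
    unsigned char block[64] = {0x80}; // 0x80 pad byte, rest zero (message length 0)
    Transform(state, block, 1);       // compress one 64-byte block
    for (uint32_t w : state)
        printf("%08x", w);
    printf("\n");
    return 0;
}
#endif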