我不知道有任何优化适用于整数分用恒定的红利。 双重检查,我尝试一试验的情况下与所有的股息与 编译器Explorer. 使用的海湾合作委员会,国际刑事法院和铛,与最高优化水平指定的,产生的代码表明没有优化被应用来划分的。
这当然是可能的建立高性能的128位司的程序,但是从个人的经验,我知道,这是相当容易出错,并且非常复杂的测试是需要实现良好测试的复盖范围包括角的情况下,为详尽的测试是不可能在这个操作数的大小。 努力对于设计和试验很容易超过什么似乎是合理的答案在计算器由两位小数的数量级。
一个简单的方法来执行整数分是使用算法我们所有了解到在小学,只有在二进制的。 这使得决定下一个商数位特别容易的:它是1当前的局部其余大于或等于,除,否则为0。 使用普通的二进制的划分,只有整行动,我们需要的是加法和减法。
我们可以建立便携元用于执行这些操作数上的任何一位长通过模仿的方式处理器的机器的指示是用于操作上的多词整数:增加与进行中,添加有进行,添加与携带和进出;类似于子。 在代码下面我使用简单的C宏;肯定更加复杂的方式都是可能的。
由于该系统的我的工作现在不支持128位整数,我推出原型并测试了这种方法为64位整数。 128位的版本,然后是一项简单的机械重新命名。 在一个现代化的64位处理我希望这128位司的功能来执行在大约3000周期。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#define SUBCcc(a,b,cy,t0,t1,t2) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)
#define SUBcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), cy=t1<t0, t1-t0)
#define SUBC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t1-t0)
#define ADDCcc(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t0+t1)
typedef struct {
uint64_t l;
uint64_t h;
} my_uint128;
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
my_uint128 quot, rem, tmp;
uint64_t cy, t0, t1, t2;
int bits_left = CHAR_BIT * sizeof (my_uint128);
quot.h = dvnd.h;
quot.l = dvnd.l;
rem.h = 0;
rem.l = 0;
do {
quot.l = ADDcc (quot.l, quot.l, cy, t0, t1);
quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
rem.l = ADDCcc (rem.l, rem.l, cy, t0, t1);
rem.h = ADDC (rem.h, rem.h, cy, t0, t1);
tmp.l = SUBcc (rem.l, dvsr.l, cy, t0, t1);
tmp.h = SUBCcc (rem.h, dvsr.h, cy, t0, t1, t2);
if (!cy) { // remainder >= divisor
rem.l = tmp.l;
rem.h = tmp.h;
quot.l = quot.l | 1;
}
bits_left--;
} while (bits_left);
return quot;
}
typedef struct {
uint32_t l;
uint32_t h;
} my_uint64;
my_uint64 bitwise_division_64 (my_uint64 dvnd, my_uint64 dvsr)
{
my_uint64 quot, rem, tmp;
uint32_t cy, t0, t1, t2;
int bits_left = CHAR_BIT * sizeof (my_uint64);
quot.h = dvnd.h;
quot.l = dvnd.l;
rem.h = 0;
rem.l = 0;
do {
quot.l = ADDcc (quot.l, quot.l, cy, t0, t1);
quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
rem.l = ADDCcc (rem.l, rem.l, cy, t0, t1);
rem.h = ADDC (rem.h, rem.h, cy, t0, t1);
tmp.l = SUBcc (rem.l, dvsr.l, cy, t0, t1);
tmp.h = SUBCcc (rem.h, dvsr.h, cy, t0, t1, t2);
if (!cy) { // remainder >= divisor
rem.l = tmp.l;
rem.h = tmp.h;
quot.l = quot.l | 1;
}
bits_left--;
} while (bits_left);
return quot;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <[email protected]>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
int main (void)
{
uint64_t a, b, res, ref;
my_uint64 aa, bb, rr;
do {
a = KISS64;
b = KISS64;
ref = a / b;
aa.l = (uint32_t)a;
aa.h = (uint32_t)(a >> 32);
bb.l = (uint32_t)b;
bb.h = (uint32_t)(b >> 32);
rr = bitwise_division_64 (aa, bb);
res = (((uint64_t)rr.h) << 32) + rr.l;
if (ref != res) {
printf ("a=%016llx b=%016llx res=%016llx ref=%016llx\n", a, b, res, ref);
return EXIT_FAILURE;
}
} while (a);
return EXIT_SUCCESS;
}
一个速度更快的方法比位的计算,以计算的倒数除数,乘以所得的股息,在初步商,然后计算的剩余部分来精确调整商。 整个计算可以通过在固定点运算。 然而,在现代的处理器快速浮点的单位,这是更方便的生成开始的近似于互惠的双精确的划分。 一个单一的哈雷迭代立方汇聚后的效果,在完整精度的互惠。
本哈雷迭代的相互是非常整数乘密集型的,有64×64位乘128位的结果(umul64wide()
在代码下面)正在建设框至关重要的性能。 在现代64位架构,这通常是一个单一的机指令中执行的几个周期,但是这不是可以携带的代码。 便携式码模拟的指令大约需要15至20说明这取决于建筑和编译器。
整个128位司应采取大约300个周期,或十倍的快速简单位计算。 因为代码是相当复杂,需要大量的测试,以确保正确操作。 在该框架下面我使用基于模式和随机测试的中等密集测试、使用简单位执行情况作为参考。
执行情况 udiv128()
以下假设编程的环境使用IEEE-754符合浮点运算, double
类型是映射到IEEE-754的 binary64
类型,以及该司的 double
操作是否正确圆。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
typedef struct {
uint64_t l;
uint64_t h;
} my_uint128;
my_uint128 make_my_uint128 (uint64_t h, uint64_t l);
my_uint128 add128 (my_uint128 a, my_uint128 b);
my_uint128 sub128 (my_uint128 a, my_uint128 b);
my_uint128 lsl128 (my_uint128 a, int sh);
my_uint128 lsr128 (my_uint128 a, int sh);
my_uint128 not128 (my_uint128 a);
my_uint128 umul128lo (my_uint128 a, my_uint128 b);
my_uint128 umul128hi (my_uint128 a, my_uint128 b);
double my_uint128_to_double (my_uint128 a);
int lt128 (my_uint128 a, my_uint128 b);
int eq128 (my_uint128 a, my_uint128 b);
uint64_t double_as_uint64 (double a);
double uint64_as_double (uint64_t a);
#define FP64_EXPO_BIAS (1023)
#define FP64_MANT_BITS (53)
#define FP64_MANT_IBIT (0x0010000000000000ULL)
#define FP64_MANT_MASK (0x000fffffffffffffULL)
#define FP64_INC_EXP_128 (0x0800000000000000ULL)
#define FP64_MANT_ADJ (2) // adjustment to ensure underestimate
my_uint128 udiv128 (my_uint128 dividend, my_uint128 divisor)
{
const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
const my_uint128 two = make_my_uint128 (0ULL, 2ULL);
my_uint128 recip, temp, quo, rem;
my_uint128 neg_divisor = sub128 (zero, divisor);
double r;
/* compute initial approximation for reciprocal; must be underestimate! */
r = 1.0 / my_uint128_to_double (divisor);
uint64_t i = double_as_uint64 (r) - FP64_MANT_ADJ + FP64_INC_EXP_128;
temp = make_my_uint128 (0ULL, (i & FP64_MANT_MASK) | FP64_MANT_IBIT);
int sh = (i >> (FP64_MANT_BITS-1)) - FP64_EXPO_BIAS - (FP64_MANT_BITS-1);
recip = (sh < 0) ? lsr128 (temp, -sh) : lsl128 (temp, sh);
/* perform Halley iteration with cubic convergence to refine reciprocal */
temp = umul128lo (neg_divisor, recip);
temp = add128 (umul128hi (temp, temp), temp);
recip = add128 (umul128hi (recip, temp), recip);
/* compute preliminary quotient and remainder */
quo = umul128hi (dividend, recip);
rem = sub128 (dividend, umul128lo (divisor, quo));
/* adjust quotient if too small; quotient off by 2 at most */
if (! lt128 (rem, divisor)) {
quo = add128 (quo, lt128 (sub128 (rem, divisor), divisor) ? one : two);
}
/* handle division by zero */
if (eq128 (divisor, zero)) quo = not128 (zero);
return quo;
}
#define SUBCcc(a,b,cy,t0,t1,t2) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)
#define SUBcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), cy=t1<t0, t1-t0)
#define SUBC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t1-t0)
#define ADDCcc(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t0+t1)
uint64_t double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}
double uint64_as_double (uint64_t a)
{
double r;
memcpy (&r, &a, sizeof r);
return r;
}
my_uint128 add128 (my_uint128 a, my_uint128 b)
{
uint64_t cy, t0, t1;
a.l = ADDcc (a.l, b.l, cy, t0, t1);
a.h = ADDC (a.h, b.h, cy, t0, t1);
return a;
}
my_uint128 sub128 (my_uint128 a, my_uint128 b)
{
uint64_t cy, t0, t1;
a.l = SUBcc (a.l, b.l, cy, t0, t1);
a.h = SUBC (a.h, b.h, cy, t0, t1);
return a;
}
my_uint128 lsl128 (my_uint128 a, int sh)
{
if (sh >= 64) {
a.h = a.l << (sh - 64);
a.l = 0ULL;
} else if (sh) {
a.h = (a.h << sh) + (a.l >> (64 - sh));
a.l = a.l << sh;
}
return a;
}
my_uint128 lsr128 (my_uint128 a, int sh)
{
if (sh >= 64) {
a.l = a.h >> (sh - 64);
a.h = 0ULL;
} else if (sh) {
a.l = (a.l >> sh) + (a.h << (64 - sh));
a.h = a.h >> sh;
}
return a;
}
my_uint128 not128 (my_uint128 a)
{
a.l = ~a.l;
a.h = ~a.h;
return a;
}
int lt128 (my_uint128 a, my_uint128 b)
{
uint64_t cy, t0, t1, t2;
a.l = SUBcc (a.l, b.l, cy, t0, t1);
a.h = SUBCcc (a.h, b.h, cy, t0, t1, t2);
return cy;
}
int eq128 (my_uint128 a, my_uint128 b)
{
return (a.l == b.l) && (a.h == b.h);
}
// derived from Hacker's Delight 2nd ed. figure 8-2
my_uint128 umul64wide (uint64_t u, uint64_t v)
{
my_uint128 r;
uint64_t u0, v0, u1, v1, w0, w1, w2, t;
u0 = (uint32_t)u; u1 = u >> 32;
v0 = (uint32_t)v; v1 = v >> 32;
w0 = u0 * v0;
t = u1 * v0 + (w0 >> 32);
w1 = (uint32_t)t;
w2 = t >> 32;
w1 = u0 * v1 + w1;
r.h = u1 * v1 + w2 + (w1 >> 32);
r.l = (w1 << 32) + (uint32_t)w0;
return r;
}
my_uint128 make_my_uint128 (uint64_t h, uint64_t l)
{
my_uint128 r;
r.h = h;
r.l = l;
return r;
}
my_uint128 umul128lo (my_uint128 a, my_uint128 b)
{
my_uint128 r;
r = umul64wide (a.l, b.l);
r.h = r.h + a.l * b.h + a.h * b.l;
return r;
}
my_uint128 umul128hi (my_uint128 a, my_uint128 b)
{
my_uint128 t0, t1, t2, t3;
t0 = umul64wide (a.l, b.l);
t3 = add128 (umul64wide (a.h, b.l), make_my_uint128 (0ULL, t0.h));
t1 = make_my_uint128 (0ULL, t3.l);
t2 = make_my_uint128 (0ULL, t3.h);
t1 = add128 (umul64wide (a.l, b.h), t1);
return add128 (add128 (umul64wide (a.h, b.h), t2), make_my_uint128 (0ULL, t1.h));
}
double my_uint128_to_double (my_uint128 a)
{
const int intbits = sizeof (a) * CHAR_BIT;
const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
my_uint128 rnd, i = a;
uint64_t j;
int sh = 0;
double r;
// normalize integer so MSB is set
if (lt128 (i, make_my_uint128(0x0000000000000001ULL, 0))) {i = lsl128 (i,64); sh += 64; }
if (lt128 (i, make_my_uint128(0x0000000100000000ULL, 0))) {i = lsl128 (i,32); sh += 32; }
if (lt128 (i, make_my_uint128(0x0001000000000000ULL, 0))) {i = lsl128 (i,16); sh += 16; }
if (lt128 (i, make_my_uint128(0x0100000000000000ULL, 0))) {i = lsl128 (i, 8); sh += 8; }
if (lt128 (i, make_my_uint128(0x1000000000000000ULL, 0))) {i = lsl128 (i, 4); sh += 4; }
if (lt128 (i, make_my_uint128(0x4000000000000000ULL, 0))) {i = lsl128 (i, 2); sh += 2; }
if (lt128 (i, make_my_uint128(0x8000000000000000ULL, 0))) {i = lsl128 (i, 1); sh += 1; }
// form mantissa with explicit integer bit
rnd = lsl128 (i, FP64_MANT_BITS);
i = lsr128 (i, intbits - FP64_MANT_BITS);
j = i.l;
// add in exponent, taking into account integer bit of mantissa
if (! eq128 (a, zero)) {
j += (uint64_t)(FP64_EXPO_BIAS + (intbits-1) - 1 - sh) << (FP64_MANT_BITS-1);
}
// round to nearest or even
rnd.h = rnd.h | (rnd.l != 0);
if ((rnd.h > 0x8000000000000000ULL) ||
((rnd.h == 0x8000000000000000ULL) && (j & 1))) j++;
// reinterpret bit pattern as IEEE-754 'binary64'
r = uint64_as_double (j);
return r;
}
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
my_uint128 quot, rem, tmp;
uint64_t cy, t0, t1, t2;
int bits_left = CHAR_BIT * sizeof (dvsr);
quot.h = dvnd.h;
quot.l = dvnd.l;
rem.h = 0;
rem.l = 0;
do {
quot.l = ADDcc (quot.l, quot.l, cy, t0, t1);
quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
rem.l = ADDCcc (rem.l, rem.l, cy, t0, t1);
rem.h = ADDC (rem.h, rem.h, cy, t0, t1);
tmp.l = SUBcc (rem.l, dvsr.l, cy, t0, t1);
tmp.h = SUBCcc (rem.h, dvsr.h, cy, t0, t1, t2);
if (!cy) { // remainder >= divisor
rem.l = tmp.l;
rem.h = tmp.h;
quot.l = quot.l | 1;
}
bits_left--;
} while (bits_left);
return quot;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <[email protected]>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
my_uint128 v[100000]; /* FIXME: size appropriately */
int main (void)
{
const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
my_uint128 dividend, divisor, quot, ref;
int i, j, patterns, idx = 0, nbrBits = sizeof (v[0]) * CHAR_BIT;
int patterns_done = 0;
/* pattern class 1: 2**i */
for (i = 0; i < nbrBits; i++) {
v [idx] = lsl128 (one, i);
idx++;
}
/* pattern class 2: 2**i-1 */
for (i = 0; i < nbrBits; i++) {
v [idx] = sub128 (lsl128 (one, i), one);
idx++;
}
/* pattern class 3: 2**i+1 */
for (i = 0; i < nbrBits; i++) {
v [idx] = add128 (lsl128 (one, i), one);
idx++;
}
/* pattern class 4: 2**i + 2**j */
for (i = 0; i < nbrBits; i++) {
for (j = 0; j < nbrBits; j++) {
v [idx] = add128 (lsl128 (one, i), lsl128 (one, j));
idx++;
}
}
/* pattern class 5: 2**i - 2**j */
for (i = 0; i < nbrBits; i++) {
for (j = 0; j < nbrBits; j++) {
v [idx] = sub128 (lsl128 (one, i), lsl128 (one, j));
idx++;
}
}
patterns = idx;
/* pattern class 6: one's complement of pattern classes 1 through 5 */
for (i = 0; i < patterns; i++) {
v [idx] = not128 (v [i]);
idx++;
}
/* pattern class 7: two's complement of pattern classes 1 through 5 */
for (i = 0; i < patterns; i++) {
v [idx] = sub128 (zero, v[i]);
idx++;
}
patterns = idx;
printf ("Starting pattern-based tests. Number of patterns: %d\n", patterns);
for (long long int k = 0; k < 100000000000LL; k++) {
if (k < patterns * patterns) {
dividend = v [k / patterns];
divisor = v [k % patterns];
} else {
if (!patterns_done) {
printf ("Starting random tests\n");
patterns_done = 1;
}
dividend.l = KISS64;
dividend.h = KISS64;
divisor.h = KISS64;
divisor.l = KISS64;
}
/* exclude cases with undefined results: division by zero */
if (! eq128 (divisor, zero)) {
quot = udiv128 (dividend, divisor);
ref = bitwise_division_128 (dividend, divisor);
if (! eq128 (quot, ref)) {
printf ("@ (%016llx_%016llx, %016llx_%016llx): quot = %016llx_%016llx ref=%016llx_%016llx\n",
dividend.h, dividend.l, divisor.h, divisor.l,
quot.h, quot.l, ref.h, ref.l);
return EXIT_FAILURE;
}
}
}
printf ("unsigned 128-bit division: tests passed\n");
return EXIT_SUCCESS;
}