具有擴展範圍的浮點類型

我需要在C中表示非常小的指令（例如，0.6745 * 2^{ - 3000}）的浮點數。有必要支持這種支持與平臺無關（工作在CPU和GPU-CUDA上）。有效數字的大長度不是必需的。我無法使用高精度庫（GMP，MPFR等），因爲它們不能在GPU上工作。另一方面，CUDA不支持long double類型。有沒有解決方法？是否有可能以某種方式實現自定義浮點類型？具有擴展範圍的浮點類型

來源

2015-03-19 Konstantin Isupov

你舉的例子值是一個'雙的範圍內'。 – 2015-03-19 08:23:32

@OliverCharlesworth謝謝。我糾正了例子。 – 2015-03-19 09:07:03

也許高精度庫可以適應GPU？ IIRC他們都有他們的算法的後備純C實現。 [CUMP]（http://www.hpcs.cs.tsukuba.ac.jp/~nakayama/cump/）似乎是這樣一個項目。 – user4815162342 2015-03-19 09:12:59

我寫的（使用this工作）簡單的解決方案：

#include <math.h> 
#include <stdint.h> 

#define DOUBLE_PRECISION 53 

/* DOUBLE PRECISION FLOATING-POINT TYPE WITH EXTENDED EXPONENT  */ 

typedef struct Real { 
    double sig;  //significand 
    long exp;  //binary exponent 
} real; 

/* UNION FOR DIVISION DOUBLE BY 2^POW        */ 

union DubleIntUnion 
{ 
    double  dvalue; 
    uint64_t ivalue; 
}; 


/* PLACE SIGNIFICAND OF REAL NUMBER IN RANGE [1, 2)   */ 

inline real adjust(real x){ 
    real y; 
    y.exp = x.exp; 
    y.sig = x.sig; 
    if(y.sig == 0){ 
     y.exp = 0; 
    } else if (fabs(y.sig) >= 2.0){ 
     y.exp = y.exp + 1; 
     y.sig = y.sig/2; 
    } else if(fabs(y.sig) < 1){ 
     y.exp = y.exp - 1; 
     y.sig = y.sig * 2; 
    } 
    return y; 
} 

/* PLACE SIGNIFICAND OF REAL NUMBER IN RANGE [1, 2) FOR TINY NUMBER */ 
/* FOR EXAMPLE, AFTER SUBTRATION OR WHEN SET REAL FROM DOUBLE   */ 

inline real adjusttiny(real x){ 
    real y; 
    y.exp = x.exp; 
    y.sig = x.sig; 
    while(1){ 
     x.exp = y.exp; 
     x.sig = y.sig; 
     y = adjust(x); 
     if(x.exp == y.exp && x.sig == y.sig) 
      break; 
    } 
    return y; 
} 

real set(double x){ 
    real y; 
    real z; 
    y.sig = x; 
    y.exp = 0; 
    return adjusttiny(y); 
}; 

real set(real x){ 
    real y; 
    y.exp = x.exp; 
    y.sig = x.sig; 
    return y; 
}; 

/* ARITHMETIC OPERATIONS */ 

//divide x by 2^pow. Assert that x.exp - pow > e_min 
inline double div2pow(const double x, const int pow) 
{ 
    DubleIntUnion diu; 
    diu.dvalue = x; 
    diu.ivalue -= (uint64_t)pow << 52;  // subtract pow from exponent 
    return diu.dvalue; 
} 

//summation 
inline real sum(real x, real y){    
    real sum; 
    int dexp = abs(x.exp - y.exp); 

    if (x.exp > y.exp){ 
     sum.exp = x.exp; 
     if(dexp <= DOUBLE_PRECISION){   
      sum.sig = div2pow(y.sig, dexp);  // divide y by 2^(x.exp - y.exp) 
      sum.sig = sum.sig + x.sig; 
     } else sum.sig = x.sig; 
    } else if (y.exp > x.exp){ 
     sum.exp = y.exp; 
     if(dexp <= DOUBLE_PRECISION){   
      sum.sig = div2pow(x.sig, dexp);  // divide x by 2^(y.exp - x.exp) 
      sum.sig = sum.sig + y.sig; 
     } else 
      sum.sig = y.sig; 
    } else { 
     sum.exp = x.exp; 
     sum.sig = x.sig + y.sig; 
    } 
    return adjust(sum); 
} 

//subtraction 
inline real sub(real x, real y){    
    real sub; 
    int dexp = abs(x.exp - y.exp); 

    if (x.exp > y.exp){ 
     sub.exp = x.exp; 
     if(dexp <= DOUBLE_PRECISION){   
      sub.sig = div2pow(y.sig, dexp); // divide y by 2^(x.exp - y.exp) 
      sub.sig = x.sig - sub.sig; 
     } else sub.sig = x.sig; 
    } else if (y.exp > x.exp){ 
     sub.exp = y.exp; 
     if(dexp <= DOUBLE_PRECISION){   
      sub.sig = div2pow(x.sig, dexp); // divide x by 2^(y.exp - x.exp) 
      sub.sig = sub.sig - y.sig; 
     } else sub.sig = -y.sig; 
    } else { 
     sub.exp = x.exp; 
     sub.sig = x.sig - y.sig; 
    } 
    return adjusttiny(sub); 
} 

//multiplication 
inline real mul(real x, real y){    
    real product; 
    product.exp = x.exp + y.exp; 
    product.sig = x.sig * y.sig; 
    return adjust(product); 
} 

//division 
inline real div(real x, real y){    
    real quotient; 
    quotient.exp = x.exp - y.exp; 
    quotient.sig = x.sig/y.sig; 
    return adjust(quotient); 
}

乍一看工作正常。也許我錯過了一些東西，或者實施可以加速？

如何在這些數字上實現功能floor和ceil？

來源

2015-03-19 10:30:12

因爲這個代碼是C++ ，'abs（）'是重載的，因此它可以像C++一樣工作。但是帖子被標記爲'C'並且'abs（）'適用於整數，所以這段代碼會使數學部分失敗。建議使用該帖子的編程語言發佈答案。 – chux 2015-03-19 14:55:12

在數學方面，1）IMO沒有看到使用範圍'（1/2，2）'而不是單個二進制範圍'[1,2]'的值。使用一個固定的範圍（當'exp> LONG_MIN'時）可以簡化各種操作。 2）像'1 << x.exp - y.exp'這樣的代碼存在很多問題，如果差值超過'long'位寬度，則爲UB; 3）代碼如'sum.sig = sum.sig + y。 sig;'不檢測溢出。 – chux 2015-03-19 15:07:38

@chux謝謝。如果你存儲一個單獨的指數，我會盡量考慮你的意見，並更正實施 – 2015-03-20 08:01:53

你可以在日誌空間工作，即代表每個號碼作爲電子郵件^X其中x是標準的浮點類型：

加法/減法（和求和更普遍）可以執行使用log-sum-exp trick，即
- Ë^X + E ^ŷ = E ^X（1 + E ^YX）= E ^{X +日誌（1 + EXP（YX））}
乘法/除法成爲加法/減法
- Ë^X ×ë^X = E ^{x + y}
擡高到po WER是相當簡單的
- （E ^X）^（E ^X）= E ^{X EXP（Y）}

來源

2015-03-19 14:42:02

如果你需要非常大的指數則symmetric level-index arithmetic可能滿足您的需求。然而，準確度難以預測，因此您可能需要更高的LI值來進行補償。提高精確度的一種常見方式是在CUDA上也使用很多的double-double arithmetic。

也有許多多倍庫的CUDA像CUMP

來源

2015-03-22 13:40:27

具有擴展範圍的浮點類型

回答

相關問題