Bug#1031500: numerical differences in x87 fpu code generated by clang-14 and clang-15

Fri Feb 17 09:55:42 GMT 2023

Package: clang-15
Version: 1:15.0.6-4
Severity: important
Control: block 1031414 with -1

While analyzing some autopkgtest regressions when pocl gets built with
llvm-15 instead of llvm-14 (#1030298, #1031414), I finally managed to
extract some parts of the kernel code generated by pocl (parts of the
pocl OpenCL fmod(float, float) implementation, pocl_cl_fmod.ll) and
added a small C wrapper to call the function (main.c, Makefile).
The resulting program produces numeric results that differ in more than
just the last digit depending on whether it was built with clang-14
or clang-15.

$ ./test_pocl_cl_fmod_llvm14
a=0 b=0.1 c=0 d=0 abserr=0 relerr=0
a=0.1 b=0.1221239 c=0.1 d=0.1 abserr=0 relerr=0
a=0.2 b=0.14424779 c=0.055752218 d=0.055752218 abserr=0 relerr=0
a=0.3 b=0.16637169 c=0.13362832 d=0.13362832 abserr=0 relerr=0
a=0.4 b=0.18849558 c=0.023008853 d=0.023008853 abserr=0 relerr=0
a=0.5 b=0.21061946 c=0.078761071 d=0.078761071 abserr=0 relerr=0
a=0.6 b=0.23274337 c=0.13451329 d=0.13451329 abserr=0 relerr=0
a=0.7 b=0.25486726 c=0.19026548 d=0.19026548 abserr=0 relerr=0
a=0.8 b=0.27699116 c=0.24601769 d=0.24601769 abserr=0 relerr=0
a=0.9 b=0.29911503 c=0.0026548803 d=0.0026548803 abserr=0 relerr=0

$ ./test_pocl_cl_fmod_llvm15
a=0 b=0.1 c=0 d=0 abserr=0 relerr=0
a=0.1 b=0.1221239 c=0.1 d=0.1 abserr=0 relerr=0
a=0.2 b=0.14424779 c=0.055752218 d=0.055752218 abserr=0 relerr=0
a=0.3 b=0.16637169 c=0.13362832 d=0.13362832 abserr=0 relerr=0
a=0.4 b=0.18849558 c=0.023008853 d=0.023008853 abserr=0 relerr=0
a=0.5 b=0.21061946 c=0.078761071 d=0.078761071 abserr=0 relerr=0
a=0.6 b=0.23274337 c=0.13451327 d=0.13451329 abserr=1.49012e-08 relerr=6.4024e-08
a=0.7 b=0.25486726 c=0.19026548 d=0.19026548 abserr=0 relerr=0
a=0.8 b=0.27699116 c=0.24601772 d=0.24601769 abserr=-2.98023e-08 relerr=-1.07593e-07 FAIL
a=0.9 b=0.29911503 c=0.0026548505 d=0.0026548803 abserr=2.98023e-08 relerr=9.9635e-08

c = _Z8_cl_fmodff(a, b)  # from pocl
d = fmodf(a, b)          # from libc, as reference

This happens on i386 with -march=i686, i.e. when clang generates x87 FPU
code rather than SSE code. The attached Makefile also works on amd64,
where it builds 32-bit binaries that show the same behavior.

Andreas
-------------- next part --------------
; ModuleID = 'parallel.bc'
source_filename = "parallel_bc"
; The data layout and triple below select a 32-bit x86 Linux target (i386).
target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
; Single-precision fused multiply-add intrinsic; @_Z8_cl_fmodff calls it to
; accumulate the rounding error of a float product.
declare float @llvm.fma.f32(float, float, float) #7

; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone uwtable willreturn
;
; float _Z8_cl_fmodff(float %0, float %1)
;
; Single-precision fmod(%0, %1) as extracted from pocl (see the bug report
; text). Overall shape, as far as the IR itself shows:
;   1. Take |x| and |y| and decompose each into a frexp-style exponent
;      (%22, %53) and fraction (%37, %68). Label names such as
;      _Z12_cl_expfrexpf / _Z11_cl_frfrexpf / _Z8_cl_rintf suggest these
;      are inlined helper functions -- TODO confirm against pocl sources.
;   2. While the exponent difference exceeds 12, reduce the scaled
;      remainder 12 bits at a time (loop at label 77).
;   3. Perform one final reduction step with the remaining exponent
;      difference, rescale by 2^(ey-1) and re-apply the sign of x.
;   4. Return NaN (0x7FF8000000000000) when x is NaN or infinite, or y is
;      NaN or zero.
;
; Float constants are written in LLVM's double-hex notation:
;   0x3810000000000000 = 2^-126 (smallest normal float)
;   0x41D0000000000000 = 2^30
;   0x7FF0000000000000 / 0xFFF0000000000000 = +inf / -inf
;   0x4160000000000000 = 2^23
;   0x3FE0000020000000 = 0.5 + 2^-24 (smallest float above 0.5)
define float @_Z8_cl_fmodff(float noundef %0, float noundef %1) unnamed_addr #8 {
.r_entry:
  ; %4 = |x| and %7 = |y|, obtained by clearing the sign bit (mask 0x7FFFFFFF).
  %2 = bitcast float %0 to i32
  %3 = and i32 %2, 2147483647
  %4 = bitcast i32 %3 to float
  %5 = bitcast float %1 to i32
  %6 = and i32 %5, 2147483647
  %7 = bitcast i32 %6 to float
  ; If |x| is below 2^-126 (subnormal or zero) scale it by 2^30 so the
  ; exponent field can be read directly; %10 is the possibly scaled value.
  %8 = fcmp olt float %4, 0x3810000000000000
  %9 = fmul float %4, 0x41D0000000000000
  %10 = select i1 %8, float %9, float %4
  ; ueq is "unordered or equal": taken for zero and for NaN.
  %11 = fcmp ueq float %10, 0.000000e+00
  br i1 %11, label %21, label %12

12:                                               ; preds = %.r_entry
  ; Exponent of |x|: biased exponent field minus 126, or minus 156
  ; (= 126 + 30) when the 2^30 pre-scaling was applied.
  %13 = select i1 %8, i32 -156, i32 -126
  %14 = bitcast float %10 to i32
  %15 = lshr i32 %14, 23
  %16 = and i32 %15, 255
  %17 = add nsw i32 %16, %13
  ; Only keep the computed exponent when %10 is neither +inf nor -inf.
  %18 = fcmp une float %10, 0x7FF0000000000000
  %19 = fcmp une float %10, 0xFFF0000000000000
  %20 = and i1 %18, %19
  br i1 %20, label %_Z12_cl_expfrexpf.exit, label %21

21:                                               ; preds = %12, %.r_entry
  ; Zero, NaN and infinity all report exponent 0 (see the phi below).
  br label %_Z12_cl_expfrexpf.exit

_Z12_cl_expfrexpf.exit:                           ; preds = %21, %12
  ; %22 = exponent of |x|.
  %22 = phi i32 [ 0, %21 ], [ %17, %12 ]
  ; "ord" is false only for NaN; NaN skips the fraction extraction.
  %23 = fcmp ord float %4, 0.000000e+00
  br i1 %23, label %24, label %_Z11_cl_frfrexpf.exit

24:                                               ; preds = %_Z12_cl_expfrexpf.exit
  ; Fraction of |x|: clear the exponent field (mask 0x807FFFFF) and insert
  ; the exponent bits 0x3F000000, giving a value in [0.5, 1).
  %25 = bitcast float %10 to i32
  %26 = and i32 %25, -2139095041
  %27 = or i32 %26, 1056964608
  %28 = fcmp une float %10, 0x7FF0000000000000
  %29 = fcmp une float %10, 0xFFF0000000000000
  %30 = and i1 %28, %29
  ; For infinite input use sign bit | 0x7F800000 (an infinity) instead.
  %31 = and i32 %25, -2147483648
  %32 = or i32 %31, 2139095040
  %33 = select i1 %30, i32 %27, i32 %32
  %34 = bitcast i32 %33 to float
  ; Zero input is passed through unchanged.
  %35 = fcmp oeq float %10, 0.000000e+00
  %36 = select i1 %35, float %10, float %34
  br label %_Z11_cl_frfrexpf.exit

_Z11_cl_frfrexpf.exit:                            ; preds = %24, %_Z12_cl_expfrexpf.exit
  ; %37 = fraction of |x| (or |x| itself when it is NaN);
  ; %38 = that fraction scaled by 4096 = 2^12.
  %37 = phi float [ %36, %24 ], [ %4, %_Z12_cl_expfrexpf.exit ]
  %38 = fmul float %37, 4.096000e+03
  ; The same exponent/fraction decomposition, now for |y| (%7).
  %39 = fcmp olt float %7, 0x3810000000000000
  %40 = fmul float %7, 0x41D0000000000000
  %41 = select i1 %39, float %40, float %7
  %42 = fcmp ueq float %41, 0.000000e+00
  br i1 %42, label %52, label %43

43:                                               ; preds = %_Z11_cl_frfrexpf.exit
  %44 = select i1 %39, i32 -156, i32 -126
  %45 = bitcast float %41 to i32
  %46 = lshr i32 %45, 23
  %47 = and i32 %46, 255
  %48 = add nsw i32 %47, %44
  %49 = fcmp une float %41, 0x7FF0000000000000
  %50 = fcmp une float %41, 0xFFF0000000000000
  %51 = and i1 %49, %50
  br i1 %51, label %_Z12_cl_expfrexpf.exit1, label %52

52:                                               ; preds = %43, %_Z11_cl_frfrexpf.exit
  br label %_Z12_cl_expfrexpf.exit1

_Z12_cl_expfrexpf.exit1:                          ; preds = %52, %43
  ; %53 = exponent of |y| (0 for zero, NaN and infinity).
  %53 = phi i32 [ 0, %52 ], [ %48, %43 ]
  %54 = fcmp ord float %7, 0.000000e+00
  br i1 %54, label %55, label %_Z11_cl_frfrexpf.exit2

55:                                               ; preds = %_Z12_cl_expfrexpf.exit1
  %56 = bitcast float %41 to i32
  %57 = and i32 %56, -2139095041
  %58 = or i32 %57, 1056964608
  %59 = fcmp une float %41, 0x7FF0000000000000
  %60 = fcmp une float %41, 0xFFF0000000000000
  %61 = and i1 %59, %60
  %62 = and i32 %56, -2147483648
  %63 = or i32 %62, 2139095040
  %64 = select i1 %61, i32 %58, i32 %63
  %65 = bitcast i32 %64 to float
  %66 = fcmp oeq float %41, 0.000000e+00
  %67 = select i1 %66, float %41, float %65
  br label %_Z11_cl_frfrexpf.exit2

_Z11_cl_frfrexpf.exit2:                           ; preds = %55, %_Z12_cl_expfrexpf.exit1
  ; %68 = fraction of |y|; %69 = 2 * fraction, the divisor used below.
  %68 = phi float [ %67, %55 ], [ %7, %_Z12_cl_expfrexpf.exit1 ]
  %69 = fmul float %68, 2.000000e+00
  ; %70 = exponent(x) - exponent(y).
  %70 = sub nsw i32 %22, %53
  ; %71 = 1 / %69. The !fpmath metadata (!86 = 2.5) relaxes the accuracy
  ; required of this division.
  %71 = fdiv float 1.000000e+00, %69, !fpmath !86
  ; Only enter the reduction loop when the exponent difference exceeds 12.
  %72 = icmp sgt i32 %70, 12
  br i1 %72, label %.preheader, label %.loopexit

.preheader:                                       ; preds = %_Z11_cl_frfrexpf.exit2
  ; Split the divisor %69 into %75 (low 12 mantissa bits cleared, mask
  ; 0xFFFFF000) and the remainder %76 = %69 - %75.
  %73 = bitcast float %69 to i32
  %74 = and i32 %73, -4096
  %75 = bitcast i32 %74 to float
  %76 = fsub float %69, %75
  br label %77

77:                                               ; preds = %_Z8_cl_rintf.exit, %.preheader
  ; Loop header. %78 = current scaled remainder, %79 = remaining exponent
  ; difference (decremented by 12 per iteration).
  %78 = phi float [ %131, %_Z8_cl_rintf.exit ], [ %38, %.preheader ]
  %79 = phi i32 [ %132, %_Z8_cl_rintf.exit ], [ %70, %.preheader ]
  ; %80 = quotient estimate = remainder * (1 / divisor).
  %80 = fmul float %71, %78
  ; Blocks 77..105 round %80 to an integral value (result in %112).
  ; %84 = (%80 + 0.5) minus its truncation to integer.
  %81 = fadd float %80, 5.000000e-01
  %82 = fptosi float %81 to i32
  %83 = sitofp i32 %82 to float
  %84 = fsub float %81, %83
  %85 = fcmp olt float %84, 0.000000e+00
  br i1 %85, label %91, label %86

86:                                               ; preds = %77
  ; Exact tie (%84 == 0) with an odd truncated integer also takes the
  ; +1.0 adjustment path.
  %87 = and i32 %82, 1
  %88 = icmp ne i32 %87, 0
  %89 = fcmp oeq float %84, 0.000000e+00
  %90 = select i1 %89, i1 %88, i1 false
  br i1 %90, label %91, label %93

91:                                               ; preds = %86, %77
  %92 = fadd float %84, 1.000000e+00
  br label %93

93:                                               ; preds = %91, %86
  %94 = phi float [ %92, %91 ], [ %84, %86 ]
  ; Special case: %80 equal to the smallest float above 0.5 substitutes 0
  ; for %81.
  %95 = fcmp oeq float %80, 0x3FE0000020000000
  %96 = select i1 %95, float 0.000000e+00, float %81
  ; Infinite %80 bypasses the rounding and is used as-is.
  %97 = fcmp une float %80, 0x7FF0000000000000
  %98 = fcmp une float %80, 0xFFF0000000000000
  %99 = and i1 %97, %98
  br i1 %99, label %100, label %_Z8_cl_rintf.exit

100:                                              ; preds = %93
  ; Values with |%80| >= 2^23 are used unrounded; "ult" also lets NaN
  ; through to block 105.
  %101 = bitcast float %80 to i32
  %102 = and i32 %101, 2147483647
  %103 = bitcast i32 %102 to float
  %104 = fcmp ult float %103, 0x4160000000000000
  br i1 %104, label %105, label %_Z8_cl_rintf.exit

105:                                              ; preds = %100
  ; %111 = |%96 - %94| with the sign bit of %80 copied onto it.
  %106 = fsub float %96, %94
  %107 = bitcast float %106 to i32
  %108 = and i32 %107, 2147483647
  %109 = and i32 %101, -2147483648
  %110 = or i32 %108, %109
  %111 = bitcast i32 %110 to float
  br label %_Z8_cl_rintf.exit

_Z8_cl_rintf.exit:                                ; preds = %105, %100, %93
  ; %112 = rounded quotient q.
  %112 = phi float [ %111, %105 ], [ %80, %100 ], [ %80, %93 ]
  ; Split q into %115 (low 12 mantissa bits cleared) and %116 = q - %115.
  %113 = bitcast float %112 to i32
  %114 = and i32 %113, -4096
  %115 = bitcast i32 %114 to float
  %116 = fsub float %112, %115
  ; %117 = rounded product divisor * q. The FMA chain then accumulates
  ; the four partial products of the split operands on top of -%117, so
  ; %122 is the rounding error of that product.
  %117 = fmul float %69, %112
  %118 = fneg float %117
  %119 = tail call float @llvm.fma.f32(float %115, float %75, float %118) #11
  %120 = tail call float @llvm.fma.f32(float %115, float %76, float %119) #11
  %121 = tail call float @llvm.fma.f32(float %116, float %75, float %120) #11
  %122 = tail call float @llvm.fma.f32(float %116, float %76, float %121) #11
  ; %123 = remainder - product; %124..%126 form a correction term from the
  ; subtraction's own rounding error and the product error %122; %127 is
  ; the corrected new remainder.
  %123 = fsub float %78, %117
  %124 = fsub float %78, %123
  %125 = fsub float %124, %117
  %126 = fsub float %125, %122
  %127 = fadd float %123, %126
  ; A negative remainder gets the divisor added back (otherwise -0.0,
  ; which leaves the value unchanged).
  %128 = fcmp olt float %127, 0.000000e+00
  %129 = select i1 %128, float %69, float -0.000000e+00
  %130 = fadd float %127, %129
  ; Scale by 2^12 for the next step and consume 12 from the exponent
  ; difference; keep looping while the old counter was above 24 (unsigned).
  %131 = fmul float %130, 4.096000e+03
  %132 = add nsw i32 %79, -12
  %133 = icmp ugt i32 %79, 24
  br i1 %133, label %77, label %.loopexit.loopexit

.loopexit.loopexit:                               ; preds = %_Z8_cl_rintf.exit
  %.lcssa29 = phi float [ %131, %_Z8_cl_rintf.exit ]
  %.lcssa = phi i32 [ %132, %_Z8_cl_rintf.exit ]
  br label %.loopexit

.loopexit:                                        ; preds = %.loopexit.loopexit, %_Z11_cl_frfrexpf.exit2
  ; %134 = remaining exponent difference, %135 = current scaled remainder
  ; (straight from the entry values when the loop was skipped).
  %134 = phi i32 [ %70, %_Z11_cl_frfrexpf.exit2 ], [ %.lcssa, %.loopexit.loopexit ]
  %135 = phi float [ %38, %_Z11_cl_frfrexpf.exit2 ], [ %.lcssa29, %.loopexit.loopexit ]
  ; Scale %135 by 2^(%134 - 11) using constructed powers of two. The
  ; exponent %136 is split as %144 + 4 * %142; %142 is 0 when
  ; -38 <= %134 <= 60, otherwise roughly %136 / 4.
  %136 = add nsw i32 %134, -11
  %137 = ashr i32 %136, 2
  %138 = lshr i32 %136, 31
  %139 = add nsw i32 %137, %138
  %140 = add nsw i32 %134, 38
  %141 = icmp ult i32 %140, 99
  %142 = select i1 %141, i32 0, i32 %139
  %143 = mul nsw i32 %142, -4
  %144 = add nsw i32 %143, %136
  ; Adding (e << 23) to 0x3F800000 (the bits of 1.0f) builds the float
  ; 2^e for exponents in the normal range: %147 = 2^%142, %150 = 2^%144.
  %145 = shl nsw i32 %142, 23
  %146 = add i32 %145, 1065353216
  %147 = bitcast i32 %146 to float
  %148 = shl i32 %144, 23
  %149 = add i32 %148, 1065353216
  %150 = bitcast i32 %149 to float
  ; One multiply by 2^%144 followed by four multiplies by 2^%142.
  %151 = fmul float %135, %150
  %152 = fmul float %151, %147
  %153 = fmul float %152, %147
  %154 = fmul float %153, %147
  %155 = fmul float %154, %147
  ; Final reduction step: same quotient-estimate / round / exact-product
  ; sequence as in the loop body, applied to %155.
  %156 = fmul float %71, %155
  %157 = fadd float %156, 5.000000e-01
  %158 = fptosi float %157 to i32
  %159 = sitofp i32 %158 to float
  %160 = fsub float %157, %159
  %161 = fcmp olt float %160, 0.000000e+00
  br i1 %161, label %167, label %162

162:                                              ; preds = %.loopexit
  %163 = and i32 %158, 1
  %164 = icmp ne i32 %163, 0
  %165 = fcmp oeq float %160, 0.000000e+00
  %166 = select i1 %165, i1 %164, i1 false
  br i1 %166, label %167, label %169

167:                                              ; preds = %162, %.loopexit
  %168 = fadd float %160, 1.000000e+00
  br label %169

169:                                              ; preds = %167, %162
  %170 = phi float [ %168, %167 ], [ %160, %162 ]
  %171 = fcmp oeq float %156, 0x3FE0000020000000
  %172 = select i1 %171, float 0.000000e+00, float %157
  %173 = fcmp une float %156, 0x7FF0000000000000
  %174 = fcmp une float %156, 0xFFF0000000000000
  %175 = and i1 %173, %174
  br i1 %175, label %176, label %_Z8_cl_rintf.exit3

176:                                              ; preds = %169
  %177 = bitcast float %156 to i32
  %178 = and i32 %177, 2147483647
  %179 = bitcast i32 %178 to float
  %180 = fcmp ult float %179, 0x4160000000000000
  br i1 %180, label %181, label %_Z8_cl_rintf.exit3

181:                                              ; preds = %176
  %182 = fsub float %172, %170
  %183 = bitcast float %182 to i32
  %184 = and i32 %183, 2147483647
  %185 = and i32 %177, -2147483648
  %186 = or i32 %184, %185
  %187 = bitcast i32 %186 to float
  br label %_Z8_cl_rintf.exit3

_Z8_cl_rintf.exit3:                               ; preds = %181, %176, %169
  ; %188 = rounded final quotient; split it and the divisor %69 into
  ; high/low parts (low 12 mantissa bits cleared / remainder).
  %188 = phi float [ %187, %181 ], [ %156, %176 ], [ %156, %169 ]
  %189 = bitcast float %188 to i32
  %190 = and i32 %189, -4096
  %191 = bitcast i32 %190 to float
  %192 = fsub float %188, %191
  %193 = bitcast float %69 to i32
  %194 = and i32 %193, -4096
  %195 = bitcast i32 %194 to float
  %196 = fsub float %69, %195
  ; Rounded product %197 and its rounding error %202 via the FMA chain.
  %197 = fmul float %69, %188
  %198 = fneg float %197
  %199 = tail call float @llvm.fma.f32(float %191, float %195, float %198) #11
  %200 = tail call float @llvm.fma.f32(float %191, float %196, float %199) #11
  %201 = tail call float @llvm.fma.f32(float %192, float %195, float %200) #11
  %202 = tail call float @llvm.fma.f32(float %192, float %196, float %201) #11
  ; %207 = corrected remainder of the final step.
  %203 = fsub float %155, %197
  %204 = fsub float %155, %203
  %205 = fsub float %204, %197
  %206 = fsub float %205, %202
  %207 = fadd float %203, %206
  ; Prepare the scaling back by 2^(%53 - 1): %214 is 0 when
  ; -48 <= %53 <= 50, otherwise roughly (%53 - 1) / 4; %217 = 2^%214.
  %208 = add nsw i32 %53, -1
  %209 = ashr i32 %208, 2
  %210 = lshr i32 %208, 31
  %211 = add nsw i32 %209, %210
  %212 = add nsw i32 %53, 48
  %213 = icmp ult i32 %212, 99
  %214 = select i1 %213, i32 0, i32 %211
  %215 = shl nsw i32 %214, 23
  %216 = add i32 %215, 1065353216
  %217 = bitcast i32 %216 to float
  ; %218 = sign bit of x.
  %218 = and i32 %2, -2147483648
  ; %220: y is NaN (bits shifted left by one exceed 0xFF000000).
  ; %.not: |x| has the bit pattern of infinity (0x7F800000).
  ; Either condition yields the NaN result.
  %219 = shl i32 %5, 1
  %220 = icmp ugt i32 %219, -16777216
  %.not = icmp eq i32 %3, 2139095040
  %or.cond = or i1 %.not, %220
  br i1 %or.cond, label %247, label %221

221:                                              ; preds = %_Z8_cl_rintf.exit3
  ; %224 = result used when |x| <= |y|: a zero carrying x's sign if
  ; |x| == |y|, otherwise x itself.
  %222 = fcmp oeq float %4, %7
  %223 = bitcast i32 %218 to float
  %224 = select i1 %222, float %223, float %0
  ; Add the divisor back if the final remainder came out negative.
  %225 = fcmp olt float %207, 0.000000e+00
  %226 = select i1 %225, float %69, float -0.000000e+00
  %227 = fadd float %207, %226
  ; Scale by 2^(%53 - 1) as 2^%229 * (2^%214)^4, with
  ; %229 = (%53 - 1) - 4 * %214.
  %228 = mul nsw i32 %214, -4
  %229 = add nsw i32 %228, %208
  %230 = shl i32 %229, 23
  %231 = add i32 %230, 1065353216
  %232 = bitcast i32 %231 to float
  %233 = fmul float %227, %232
  %234 = fmul float %233, %217
  %235 = fmul float %234, %217
  %236 = fmul float %235, %217
  %237 = fmul float %236, %217
  ; XOR in the sign bit of x.
  %238 = bitcast float %237 to i32
  %239 = xor i32 %218, %238
  %240 = bitcast i32 %239 to float
  ; %242: x is NaN; %243: y == 0. Either yields the NaN result.
  %241 = shl i32 %2, 1
  %242 = icmp ugt i32 %241, -16777216
  %243 = fcmp oeq float %1, 0.000000e+00
  %244 = or i1 %243, %242
  ; The computed remainder is only used when |x| > |y|.
  %245 = fcmp ogt float %4, %7
  %246 = select i1 %245, float %240, float %224
  br i1 %244, label %247, label %.r_exit

247:                                              ; preds = %221, %_Z8_cl_rintf.exit3
  br label %.r_exit

.r_exit:                                          ; preds = %247, %221
  ; 0x7FF8000000000000 is a quiet NaN.
  %248 = phi float [ %246, %221 ], [ 0x7FF8000000000000, %247 ]
  ret float %248
}

; #7: attribute group of the @llvm.fma.f32 declaration.
attributes #7 = { nofree nosync nounwind readnone speculatable willreturn }
; #8: attribute group of @_Z8_cl_fmodff. It pins "target-cpu"="i686" and
; "target-features"="+cx8,+x87"; no SSE feature is listed.
attributes #8 = { mustprogress nofree norecurse nosync nounwind readnone uwtable willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="i686" "target-features"="+cx8,+x87" }
; #11: attribute group used at the @llvm.fma.f32 call sites.
attributes #11 = { nounwind }

; Operand of the !fpmath attachment on the fdiv in @_Z8_cl_fmodff. Per the
; LLVM Language Reference, the value is the maximum acceptable relative
; error of the result, in ULPs.
!86 = !{float 2.500000e+00}
-------------- next part --------------
#include <stdio.h>
#include <math.h>

float _Z8_cl_fmodff(float, float);

/*
 * Driver for the reproducer: evaluates pocl's fmod implementation
 * (_Z8_cl_fmodff) and libc's fmodf on ten (a, b) sample pairs and prints
 * both results together with their absolute and relative difference.
 * A line is tagged " FAIL" when |relative difference| >= 1e-7.
 */
int main()
{
  int step = 0;

  while (step < 10) {
    /* Each intermediate is kept in its own float variable so that every
       value is rounded to single precision exactly as before. */
    float a = ((float)step) / 10.;
    float b = ((float)step) / 45.2 + 0.1;
    float from_pocl = _Z8_cl_fmodff(a, b);
    float from_libc = fmodf(a, b);
    float abs_err = from_libc - from_pocl;
    float rel_err = abs_err / b;
    const char *verdict = "";

    if (fabs(rel_err) >= 1e-7)
      verdict = " FAIL";

    printf("a=%g b=%.8g c=%.8g d=%.8g abserr=%g relerr=%g%s\n",
           a, b, from_pocl, from_libc, abs_err, rel_err, verdict);

    ++step;
  }

  return 0;
}
-------------- next part --------------
# Builds the fmod reproducer once per clang version so the outputs of the
# resulting binaries can be compared.
#
# --target=i386-unknown-linux-gnu -march=i686 requests 32-bit i686 code,
# which is the configuration in which the difference was observed.
CFLAGS	 = -O2 --target=i386-unknown-linux-gnu -march=i686

# 'all' and 'clean' name actions, not files; without this declaration a
# stray file called "all" or "clean" would silently disable them.
.PHONY: all clean

all: test_pocl_cl_fmod_llvm14 test_pocl_cl_fmod_llvm15

# The pattern stem ($*) is the clang major version, so the target
# test_pocl_cl_fmod_llvm14 is compiled with clang-14, and so on.
# $^ expands to both prerequisites (the IR file and the C driver).
test_pocl_cl_fmod_llvm%: pocl_cl_fmod.ll main.c
	clang-$* $(CFLAGS) $^ -o $@ -lm

# Removes every binary whose version suffix is exactly two characters long.
clean:
	$(RM) test_pocl_cl_fmod_llvm??