; loop-rotation-vs-common-code-hoisting.ll
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -O3 -rotation-max-header-size=0 -S < %s                    | FileCheck %s --check-prefixes=HOIST,THR0,FALLBACK0
; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefixes=HOIST,THR0,FALLBACK1

; RUN: opt -O3 -rotation-max-header-size=1 -S < %s                    | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK2
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK3

; RUN: opt -O3 -rotation-max-header-size=2 -S < %s                    | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK4
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK5

; RUN: opt -O3 -rotation-max-header-size=3 -S < %s                    | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK6
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK7

; This example is produced from very basic C code:
;
;   void f0();
;   void f1();
;   void f2();
;
;   void loop(int width) {
;       if(width < 1)
;           return;
;       for(int i = 0; i < width - 1; ++i) {
;           f0();
;           f1();
;       }
;       f0();
;       f2();
;   }

; We have a choice here. We can either
; * hoist the f0() call into the loop header,
;   * which potentially makes loop rotation unprofitable, since the loop header
;     might have grown above a certain threshold, and such unrotated loops will
;     be ignored by LoopVectorizer, preventing vectorization
;   * or loop rotation will succeed, resulting in some weird PHIs that will also
;     harm vectorization
; * or not hoist the f0() call before performing loop rotation,
;   at the cost of potential code bloat and/or potentially successfully rotating
;   the loops, vectorizing them at the cost of compile time.

; Module data layout: little-endian ("e"), ELF name mangling ("m:e"), 32-bit
; pointers in address spaces 270/271 and 64-bit pointers in 272, native integer
; widths of 8/16/32/64 bits, and a 128-bit natural stack alignment ("S128").
; NOTE(review): this looks like the usual x86-64 layout, but no target triple is
; given in this file - confirm before relying on target-specific behaviour.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

; External functions called by the test function. They are declared without a
; body and without attributes.
declare void @f0()
declare void @f1()
declare void @f2()

; Lifetime markers used in the test function around the stack slot of the loop
; counter. The first operand is the object size and must be an immediate.
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)

; Unoptimized (alloca/load/store form) IR for the loop(int width) function shown
; in the C snippet at the top of this file. The assertions that follow the
; define line are autogenerated; regenerate them with
; utils/update_test_checks.py rather than editing them by hand.
;
; Per the opt invocations at the top of the file, the expected output depends
; on -rotation-max-header-size:
;   0 or 1 - the loop stays unrotated and a single f0() call sits in the
;            loop header (for.cond);
;   2 or 3 - the loop is rotated and f0() is called both in for.body and in
;            for.cond.cleanup.
define void @_Z4loopi(i32 %width) {
; HOIST-LABEL: @_Z4loopi(
; HOIST-NEXT:  entry:
; HOIST-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; HOIST-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; HOIST:       for.cond.preheader:
; HOIST-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; HOIST-NEXT:    br label [[FOR_COND:%.*]]
; HOIST:       for.cond:
; HOIST-NEXT:    [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; HOIST-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
; HOIST-NEXT:    tail call void @f0()
; HOIST-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; HOIST:       for.cond.cleanup:
; HOIST-NEXT:    tail call void @f2()
; HOIST-NEXT:    br label [[RETURN]]
; HOIST:       for.body:
; HOIST-NEXT:    tail call void @f1()
; HOIST-NEXT:    [[INC]] = add nuw i32 [[I_0]], 1
; HOIST-NEXT:    br label [[FOR_COND]]
; HOIST:       return:
; HOIST-NEXT:    ret void
;
; ROTATED_LATER_OLDPM-LABEL: @_Z4loopi(
; ROTATED_LATER_OLDPM-NEXT:  entry:
; ROTATED_LATER_OLDPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_OLDPM:       for.cond.preheader:
; ROTATED_LATER_OLDPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATED_LATER_OLDPM-NEXT:    [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; ROTATED_LATER_OLDPM:       for.cond.cleanup:
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f2()
; ROTATED_LATER_OLDPM-NEXT:    br label [[RETURN]]
; ROTATED_LATER_OLDPM:       for.body:
; ROTATED_LATER_OLDPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f1()
; ROTATED_LATER_OLDPM-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
; ROTATED_LATER_OLDPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATED_LATER_OLDPM:       return:
; ROTATED_LATER_OLDPM-NEXT:    ret void
;
; ROTATED_LATER_NEWPM-LABEL: @_Z4loopi(
; ROTATED_LATER_NEWPM-NEXT:  entry:
; ROTATED_LATER_NEWPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.preheader:
; ROTATED_LATER_NEWPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATED_LATER_NEWPM-NEXT:    [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.preheader.for.body_crit_edge:
; ROTATED_LATER_NEWPM-NEXT:    [[INC_1:%.*]] = add nuw i32 0, 1
; ROTATED_LATER_NEWPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.cleanup:
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f2()
; ROTATED_LATER_NEWPM-NEXT:    br label [[RETURN]]
; ROTATED_LATER_NEWPM:       for.body:
; ROTATED_LATER_NEWPM-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE]] ]
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f1()
; ROTATED_LATER_NEWPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
; ROTATED_LATER_NEWPM:       for.body.for.body_crit_edge:
; ROTATED_LATER_NEWPM-NEXT:    [[INC_0]] = add nuw i32 [[INC_PHI]], 1
; ROTATED_LATER_NEWPM-NEXT:    br label [[FOR_BODY]]
; ROTATED_LATER_NEWPM:       return:
; ROTATED_LATER_NEWPM-NEXT:    ret void
;
; ROTATE_OLDPM-LABEL: @_Z4loopi(
; ROTATE_OLDPM-NEXT:  entry:
; ROTATE_OLDPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATE_OLDPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_OLDPM:       for.cond.preheader:
; ROTATE_OLDPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATE_OLDPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATE_OLDPM:       for.body.preheader:
; ROTATE_OLDPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_OLDPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATE_OLDPM:       for.cond.cleanup:
; ROTATE_OLDPM-NEXT:    tail call void @f0()
; ROTATE_OLDPM-NEXT:    tail call void @f2()
; ROTATE_OLDPM-NEXT:    br label [[RETURN]]
; ROTATE_OLDPM:       for.body:
; ROTATE_OLDPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; ROTATE_OLDPM-NEXT:    tail call void @f0()
; ROTATE_OLDPM-NEXT:    tail call void @f1()
; ROTATE_OLDPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATE_OLDPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATE_OLDPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATE_OLDPM:       return:
; ROTATE_OLDPM-NEXT:    ret void
;
; ROTATE_NEWPM-LABEL: @_Z4loopi(
; ROTATE_NEWPM-NEXT:  entry:
; ROTATE_NEWPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATE_NEWPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_NEWPM:       for.cond.preheader:
; ROTATE_NEWPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATE_NEWPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATE_NEWPM:       for.body.preheader:
; ROTATE_NEWPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_NEWPM-NEXT:    [[INC_1:%.*]] = add nuw nsw i32 0, 1
; ROTATE_NEWPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATE_NEWPM:       for.cond.cleanup:
; ROTATE_NEWPM-NEXT:    tail call void @f0()
; ROTATE_NEWPM-NEXT:    tail call void @f2()
; ROTATE_NEWPM-NEXT:    br label [[RETURN]]
; ROTATE_NEWPM:       for.body:
; ROTATE_NEWPM-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_BODY_PREHEADER]] ]
; ROTATE_NEWPM-NEXT:    tail call void @f0()
; ROTATE_NEWPM-NEXT:    tail call void @f1()
; ROTATE_NEWPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
; ROTATE_NEWPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
; ROTATE_NEWPM:       for.body.for.body_crit_edge:
; ROTATE_NEWPM-NEXT:    [[INC_0]] = add nuw nsw i32 [[INC_PHI]], 1
; ROTATE_NEWPM-NEXT:    br label [[FOR_BODY]]
; ROTATE_NEWPM:       return:
; ROTATE_NEWPM-NEXT:    ret void
;
entry:
  ; Stack slots for the 'width' parameter and for the loop counter 'i'.
  %width.addr = alloca i32, align 4
  %i = alloca i32, align 4
  store i32 %width, i32* %width.addr, align 4
  ; Early exit - return immediately when width < 1 (signed compare).
  %i1 = load i32, i32* %width.addr, align 4
  %cmp = icmp slt i32 %i1, 1
  br i1 %cmp, label %if.then, label %if.end

if.then:
  br label %return

if.end:
  ; Start the lifetime of the 4-byte slot of 'i' and initialise it to 0.
  %i2 = bitcast i32* %i to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %i2)
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:
  ; Loop header - keep iterating while i < width - 1 (signed compare).
  %i3 = load i32, i32* %i, align 4
  %i4 = load i32, i32* %width.addr, align 4
  %sub = sub nsw i32 %i4, 1
  %cmp1 = icmp slt i32 %i3, %sub
  br i1 %cmp1, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  ; Loop exit - end the lifetime of the slot of 'i'.
  %i5 = bitcast i32* %i to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %i5)
  br label %for.end

for.body:
  ; Loop body - f0() followed by f1().
  call void @f0()
  call void @f1()
  br label %for.inc

for.inc:
  ; ++i, written back to the stack slot.
  %i6 = load i32, i32* %i, align 4
  %inc = add nsw i32 %i6, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:
  ; After the loop - one more f0() call, then f2(). This f0() duplicates the
  ; one at the start of for.body; whether the two end up as a single call in
  ; the loop header is what this test observes.
  call void @f0()
  call void @f2()
  br label %return

return:
  ret void
}