summaryrefslogtreecommitdiff
path: root/lib/ccan/ilog/ilog.c
blob: f42066c5777bbe379aa0d1c92acc670dc51027fb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/*(C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 LGPL (v2 or later).
 * See LICENSE file for details. */
#include "ilog.h"
#include <limits.h>

/*The fastest fallback strategy for platforms with fast multiplication appears
   to be based on de Bruijn sequences~\cite{LP98}.
  Tests confirmed this to be true even on an ARM11, where it is actually faster
   than using the native clz instruction.
  Define ILOG_NODEBRUIJN to use a simpler fallback on platforms where
   multiplication or table lookups are too expensive.

  @UNPUBLISHED{LP98,
    author="Charles E. Leiserson and Harald Prokop",
    title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
    month=Jun,
    year=1998,
    note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
  }*/
static UNNEEDED const unsigned char DEBRUIJN_IDX32[32]={
   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
};

/* We always compile these in, in case someone takes address of function. */
#undef ilog32_nz
#undef ilog32
#undef ilog64_nz
#undef ilog64

int ilog32(uint32_t _v){
/*On a Pentium M, this branchless version tested as the fastest version without
   multiplications on 1,000,000,000 random 32-bit integers, edging out a
   similar version with branches, and a 256-entry LUT version.*/
# if defined(ILOG_NODEBRUIJN)
  int ret;
  int m;
  ret=_v>0;
  m=(_v>0xFFFFU)<<4;
  _v>>=m;
  ret|=m;
  m=(_v>0xFFU)<<3;
  _v>>=m;
  ret|=m;
  m=(_v>0xFU)<<2;
  _v>>=m;
  ret|=m;
  m=(_v>3)<<1;
  _v>>=m;
  ret|=m;
  ret+=_v>1;
  return ret;
/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
# else
  int ret;
  ret=_v>0;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  _v=(_v>>1)+1;
  ret+=DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
  return ret;
# endif
}

int ilog32_nz(uint32_t _v)
{
  return ilog32(_v);
}

int ilog64(uint64_t _v){
# if defined(ILOG_NODEBRUIJN)
  uint32_t v;
  int      ret;
  int      m;
  ret=_v>0;
  m=(_v>0xFFFFFFFFU)<<5;
  v=(uint32_t)(_v>>m);
  ret|=m;
  m=(v>0xFFFFU)<<4;
  v>>=m;
  ret|=m;
  m=(v>0xFFU)<<3;
  v>>=m;
  ret|=m;
  m=(v>0xFU)<<2;
  v>>=m;
  ret|=m;
  m=(v>3)<<1;
  v>>=m;
  ret|=m;
  ret+=v>1;
  return ret;
# else
/*If we don't have a 64-bit word, split it into two 32-bit halves.*/
#  if LONG_MAX<9223372036854775807LL
  uint32_t v;
  int      ret;
  int      m;
  ret=_v>0;
  m=(_v>0xFFFFFFFFU)<<5;
  v=(uint32_t)(_v>>m);
  ret|=m;
  v|=v>>1;
  v|=v>>2;
  v|=v>>4;
  v|=v>>8;
  v|=v>>16;
  v=(v>>1)+1;
  ret+=DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
  return ret;
/*Otherwise do it in one 64-bit operation.*/
#  else
  static const unsigned char DEBRUIJN_IDX64[64]={
     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
  };
  int ret;
  ret=_v>0;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  _v|=_v>>32;
  _v=(_v>>1)+1;
  ret+=DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
  return ret;
#  endif
# endif
}

int ilog64_nz(uint64_t _v)
{
  return ilog64(_v);
}