KGraph
 All Classes Namespaces Functions Pages
kgraph-data.h
1 #ifndef WDONG_KGRAPH_DATA
2 #define WDONG_KGRAPH_DATA
3 
4 #include <cmath>
5 #include <cstring>
6 #include <malloc.h>
7 #include <vector>
8 #include <fstream>
9 #include <stdexcept>
10 #include <boost/assert.hpp>
11 
12 #ifdef __GNUC__
13 #ifdef __AVX__
14 #define KGRAPH_MATRIX_ALIGN 32
15 #else
16 #ifdef __SSE2__
17 #define KGRAPH_MATRIX_ALIGN 16
18 #else
19 #define KGRAPH_MATRIX_ALIGN 4
20 #endif
21 #endif
22 #endif
23 
24 namespace kgraph {
25 
27 
29  extern float float_l2sqr_avx (float const *t1, float const *t2, unsigned dim);
31  extern float float_l2sqr_sse2 (float const *t1, float const *t2, unsigned dim);
32  extern float float_l2sqr_sse2 (float const *, unsigned dim);
33  extern float float_dot_sse2 (float const *, float const *, unsigned dim);
35  extern float uint8_l2sqr_sse2 (uint8_t const *t1, uint8_t const *t2, unsigned dim);
36 
37  extern float float_l2sqr (float const *, float const *, unsigned dim);
38  extern float float_l2sqr (float const *, unsigned dim);
39  extern float float_dot (float const *, float const *, unsigned dim);
40 
41 
42  using std::vector;
43 
45  namespace metric {
47  struct l2sqr {
48  template <typename T>
50  static float apply (T const *t1, T const *t2, unsigned dim) {
51  float r = 0;
52  for (unsigned i = 0; i < dim; ++i) {
53  float v = float(t1[i]) - float(t2[i]);
54  v *= v;
55  r += v;
56  }
57  return r;
58  }
59 
61  template <typename T>
62  static float dot (T const *t1, T const *t2, unsigned dim) {
63  float r = 0;
64  for (unsigned i = 0; i < dim; ++i) {
65  r += float(t1[i]) *float(t2[i]);
66  }
67  return r;
68  }
69 
71  template <typename T>
72  static float norm2 (T const *t1, unsigned dim) {
73  float r = 0;
74  for (unsigned i = 0; i < dim; ++i) {
75  float v = float(t1[i]);
76  v *= v;
77  r += v;
78  }
79  return r;
80  }
81  };
82 
83  struct l2 {
84  template <typename T>
85  static float apply (T const *t1, T const *t2, unsigned dim) {
86  return sqrt(l2sqr::apply<T>(t1, t2, dim));
87  }
88  };
89  }
90 
92  template <typename T, unsigned A = KGRAPH_MATRIX_ALIGN>
93  class Matrix {
94  unsigned col;
95  unsigned row;
96  size_t stride;
97  char *data;
98 
99  void reset (unsigned r, unsigned c) {
100  row = r;
101  col = c;
102  stride = (sizeof(T) * c + A - 1) / A * A;
103  /*
104  data.resize(row * stride);
105  */
106  if (data) free(data);
107  data = (char *)memalign(A, row * stride); // SSE instruction needs data to be aligned
108  if (!data) throw runtime_error("memalign");
109  }
110  public:
111  Matrix (): col(0), row(0), stride(0), data(0) {}
112  Matrix (unsigned r, unsigned c): data(0) {
113  reset(r, c);
114  }
115  ~Matrix () {
116  if (data) free(data);
117  }
118  unsigned size () const {
119  return row;
120  }
121  unsigned dim () const {
122  return col;
123  }
124  size_t step () const {
125  return stride;
126  }
127  void resize (unsigned r, unsigned c) {
128  reset(r, c);
129  }
130  T const *operator [] (unsigned i) const {
131  return reinterpret_cast<T const *>(&data[stride * i]);
132  }
133  T *operator [] (unsigned i) {
134  return reinterpret_cast<T *>(&data[stride * i]);
135  }
136  void zero () {
137  memset(data, 0, row * stride);
138  }
139 
140  void normalize2 () {
141 #pragma omp parallel for
142  for (unsigned i = 0; i < row; ++i) {
143  T *p = operator[](i);
144  double sum = metric::l2sqr::norm2(p, col);
145  sum = std::sqrt(sum);
146  for (unsigned j = 0; j < col; ++j) {
147  p[j] /= sum;
148  }
149  }
150  }
151 
152  void load (const std::string &path, unsigned dim, unsigned skip = 0, unsigned gap = 0) {
153  std::ifstream is(path.c_str(), std::ios::binary);
154  if (!is) throw io_error(path);
155  is.seekg(0, std::ios::end);
156  size_t size = is.tellg();
157  size -= skip;
158  unsigned line = sizeof(T) * dim + gap;
159  unsigned N = size / line;
160  reset(N, dim);
161  zero();
162  is.seekg(skip, std::ios::beg);
163  for (unsigned i = 0; i < N; ++i) {
164  is.read(&data[stride * i], sizeof(T) * dim);
165  is.seekg(gap, std::ios::cur);
166  }
167  if (!is) throw io_error(path);
168  }
169 
170  void load_lshkit (std::string const &path) {
171  static const unsigned LSHKIT_HEADER = 3;
172  std::ifstream is(path.c_str(), std::ios::binary);
173  unsigned header[LSHKIT_HEADER]; /* entry size, row, col */
174  is.read((char *)header, sizeof header);
175  if (!is) throw io_error(path);
176  if (header[0] != sizeof(T)) throw io_error(path);
177  is.close();
178  unsigned D = header[2];
179  unsigned skip = LSHKIT_HEADER * sizeof(unsigned);
180  unsigned gap = 0;
181  load(path, D, skip, gap);
182  }
183 
184  void save_lshkit (std::string const &path) {
185  std::ofstream os(path.c_str(), std::ios::binary);
186  unsigned header[3];
187  assert(sizeof header == 3*4);
188  header[0] = sizeof(T);
189  header[1] = row;
190  header[2] = col;
191  os.write((const char *)header, sizeof(header));
192  for (unsigned i = 0; i < row; ++i) {
193  os.write(&data[stride * i], sizeof(T) * col);
194  }
195  }
196  };
197 
199  template <typename DATA_TYPE, unsigned A = KGRAPH_MATRIX_ALIGN>
200  class MatrixProxy {
201  unsigned rows;
202  unsigned cols; // # elements, not bytes, in a row,
203  size_t stride; // # bytes in a row, >= cols * sizeof(element)
204  uint8_t const *data;
205  public:
206  MatrixProxy (Matrix<DATA_TYPE> const &m)
207  : rows(m.size()), cols(m.dim()), stride(m.step()), data(reinterpret_cast<uint8_t const *>(m[0])) {
208  }
209 
210 #ifndef __AVX__
211 #ifdef FLANN_DATASET_H_
212  MatrixProxy (flann::Matrix<DATA_TYPE> const &m)
214  : rows(m.rows), cols(m.cols), stride(m.stride), data(m.data) {
215  if (stride % A) throw invalid_argument("bad alignment");
216  }
217 #endif
218 #ifdef __OPENCV_CORE_HPP__
219  MatrixProxy (cv::Mat const &m)
221  : rows(m.rows), cols(m.cols), stride(m.step), data(m.data) {
222  if (stride % A) throw invalid_argument("bad alignment");
223  }
224 #endif
225 #ifdef NPY_NDARRAYOBJECT_H
226  MatrixProxy (PyArrayObject *obj) {
228  if (!obj || (obj->nd != 2)) throw invalid_argument("bad array shape");
229  rows = obj->dimensions[0];
230  cols = obj->dimensions[1];
231  stride = obj->strides[0];
232  data = reinterpret_cast<uint8_t const *>(obj->data);
233  if (obj->descr->elsize != sizeof(DATA_TYPE)) throw invalid_argument("bad data type size");
234  if (stride % A) throw invalid_argument("bad alignment");
235  if (!(stride >= cols * sizeof(DATA_TYPE))) throw invalid_argument("bad stride");
236  }
237 #endif
238 #endif
239  unsigned size () const {
240  return rows;
241  }
242  unsigned dim () const {
243  return cols;
244  }
245  DATA_TYPE const *operator [] (unsigned i) const {
246  return reinterpret_cast<DATA_TYPE const *>(data + stride * i);
247  }
248  DATA_TYPE *operator [] (unsigned i) {
249  return const_cast<DATA_TYPE *>(reinterpret_cast<DATA_TYPE const *>(data + stride * i));
250  }
251  };
252 
254 
257  template <typename DATA_TYPE, typename DIST_TYPE>
260  public:
263  DATA_TYPE const *query;
264  public:
265  SearchOracle (MatrixProxy<DATA_TYPE> const &p, DATA_TYPE const *q): proxy(p), query(q) {
266  }
267  virtual unsigned size () const {
268  return proxy.size();
269  }
270  virtual float operator () (unsigned i) const {
271  return DIST_TYPE::apply(proxy[i], query, proxy.dim());
272  }
273  };
274  template <typename MATRIX_TYPE>
275  MatrixOracle (MATRIX_TYPE const &m): proxy(m) {
276  }
277  virtual unsigned size () const {
278  return proxy.size();
279  }
280  virtual float operator () (unsigned i, unsigned j) const {
281  return DIST_TYPE::apply(proxy[i], proxy[j], proxy.dim());
282  }
283  SearchOracle query (DATA_TYPE const *query) const {
284  return SearchOracle(proxy, query);
285  }
286  };
287 
288  inline float AverageRecall (Matrix<float> const &gs, Matrix<float> const &result, unsigned K = 0) {
289  if (K == 0) {
290  K = result.dim();
291  }
292  if (!(gs.dim() >= K)) throw invalid_argument("gs.dim() >= K");
293  if (!(result.dim() >= K)) throw invalid_argument("result.dim() >= K");
294  if (!(gs.size() >= result.size())) throw invalid_argument("gs.size() > result.size()");
295  float sum = 0;
296  for (unsigned i = 0; i < result.size(); ++i) {
297  float const *gs_row = gs[i];
298  float const *re_row = result[i];
299  // compare
300  unsigned found = 0;
301  unsigned gs_n = 0;
302  unsigned re_n = 0;
303  while ((gs_n < K) && (re_n < K)) {
304  if (gs_row[gs_n] < re_row[re_n]) {
305  ++gs_n;
306  }
307  else if (gs_row[gs_n] == re_row[re_n]) {
308  ++found;
309  ++gs_n;
310  ++re_n;
311  }
312  else {
313  throw runtime_error("distance is unstable");
314  }
315  }
316  sum += float(found) / K;
317  }
318  return sum / result.size();
319  }
320 
321 
322 }
323 
324 #ifndef KGRAPH_NO_VECTORIZE
325 #ifdef __GNUC__
326 #ifdef __AVX__
327 #if 0
328 namespace kgraph { namespace metric {
329  template <>
330  inline float l2sqr::apply<float> (float const *t1, float const *t2, unsigned dim) {
331  return float_l2sqr_avx(t1, t2, dim);
332  }
333 }}
334 #endif
335 #else
336 #ifdef __SSE2__
337 namespace kgraph { namespace metric {
338  template <>
339  inline float l2sqr::apply<float> (float const *t1, float const *t2, unsigned dim) {
340  return float_l2sqr_sse2(t1, t2, dim);
341  }
342  template <>
343  inline float l2sqr::dot<float> (float const *t1, float const *t2, unsigned dim) {
344  return float_dot_sse2(t1, t2, dim);
345  }
346  template <>
347  inline float l2sqr::norm2<float> (float const *t1, unsigned dim) {
348  return float_l2sqr_sse2(t1, dim);
349  }
350  template <>
351  inline float l2sqr::apply<uint8_t> (uint8_t const *t1, uint8_t const *t2, unsigned dim) {
352  return uint8_l2sqr_sse2(t1, t2, dim);
353  }
354 }}
355 #endif
356 #endif
357 #endif
358 #endif
359 
360 
361 
362 #endif
363 
L2 square distance.
Definition: kgraph-data.h:47
static float dot(T const *t1, T const *t2, unsigned dim)
inner product.
Definition: kgraph-data.h:62
Search oracle.
Definition: kgraph.h:66
Definition: kgraph-data.h:83
static float norm2(T const *t1, unsigned dim)
L2 norm.
Definition: kgraph-data.h:72
Index oracle.
Definition: kgraph.h:49
virtual unsigned size() const
Returns the size of the dataset.
Definition: kgraph-data.h:277
Definition: kgraph-data.h:261
virtual float operator()(unsigned i) const
Computes similarity.
Definition: kgraph-data.h:270
Matrix data.
Definition: kgraph-data.h:93
Oracle for Matrix or MatrixProxy.
Definition: kgraph-data.h:258
Matrix proxy to interface with 3rd party libraries (FLANN, OpenCV, NumPy).
Definition: kgraph-data.h:200
virtual float operator()(unsigned i, unsigned j) const
Computes similarity.
Definition: kgraph-data.h:280
static float apply(T const *t1, T const *t2, unsigned dim)
L2 square distance.
Definition: kgraph-data.h:50
virtual unsigned size() const
Returns the size of the dataset.
Definition: kgraph-data.h:267