The Tiny HTM Library
src/tree_gen.c
00001 
00197 #include <assert.h>
00198 #include <ctype.h>
00199 #include <errno.h>
00200 #include <stdarg.h>
00201 #include <stdio.h>
00202 #include <stdlib.h>
00203 #include <string.h>
00204 #include <sys/types.h>
00205 #include <sys/stat.h>
00206 #include <sys/time.h>
00207 #include <sys/mman.h>
00208 #include <fcntl.h>
00209 #include <unistd.h>
00210 #include <getopt.h>
00211 #include <pthread.h>
00212 
00213 #include "tinyhtm/geometry.h"
00214 #include "tinyhtm/htm.h"
00215 #include "tinyhtm/tree.h"
00216 #include "tinyhtm/varint.h"
00217 
00218 #ifdef __cplusplus
00219 extern "C" {
00220 #endif
00221 
00224 /* ================================================================ */
00225 /*                     Utilities and plumbing                       */
00226 /* ================================================================ */
00227 
00228 
00229 /*  Prints an error message to stderr and exits.
00230  */
00231 static void err(const char *fmt, ...)
00232 {
00233     va_list ap;
00234     fprintf(stderr, "ERROR: ");
00235     va_start(ap, fmt);
00236     vfprintf(stderr, fmt, ap);
00237     va_end(ap);
00238     fprintf(stderr, "\n");
00239     fflush(stderr);
00240     exit(EXIT_FAILURE);
00241 }
00242 
00243 /*  Prints a message to stdout.
00244  */
00245 static void msg(const char *fmt, ...)
00246 {
00247     va_list ap;
00248     va_start(ap, fmt);
00249     vprintf(fmt, ap);
00250     va_end(ap);
00251     fflush(stdout);
00252 }
00253 
00254 /*  Returns the number of seconds that have elapsed since the epoch.
00255  */
00256 static double now() {
00257     struct timeval t;
00258     gettimeofday(&t, NULL);
00259     return ((double) t.tv_sec) + ((double) t.tv_usec) / 1.0e6;
00260 }
00261 
00262 
00263 /* ---- Memory and IO related parameters ---- */
00264 
00265 struct mem_params {
00266     size_t memsz;   /* Max memory usage in bytes */
00267     size_t sortsz;  /* Size of blocks operated on by in-memory sorts */
00268     size_t ioblksz; /* Size of IO blocks */
00269     size_t k;       /* Number of merge segments in one multi-way merge pass */
00270 };
00271 
00272 static void mem_params_init(struct mem_params *mem,
00273                             size_t total,
00274                             size_t ioblksz)
00275 {
00276     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
00277     if (total % (2*pagesz) != 0) {
00278         total += 2*pagesz - total % (2*pagesz);
00279     }
00280     if (ioblksz % pagesz != 0) {
00281         ioblksz += pagesz - ioblksz % pagesz;
00282     }
00283     if (total < 6*ioblksz) {
00284         total = 6*ioblksz;
00285     }
00286     mem->memsz = total;
00287     mem->sortsz = total / 2;
00288     mem->ioblksz = ioblksz;
00289     mem->k = (total - 2*ioblksz) / (2*ioblksz);
00290 }
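/*  Worked example (illustrative values, assuming a 4 KiB page size): for
    total = 256 MiB and ioblksz = 1 MiB, both values are already page
    aligned and total exceeds 6*ioblksz, so

        memsz  = 256 MiB
        sortsz = 128 MiB
        k      = (256 MiB - 2 MiB) / (2 MiB) = 127

    i.e. up to 127 sorted runs can be merged per multi-way merge pass;
    2 I/O blocks are reserved for the double-buffered block writer and
    each merge segment prefetches up to 2 I/O blocks at a time.
 */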
00291 
00292 
00293 /* ---- Asynchronous block writer ---- */
00294 
00295 enum bk_write_state {
00296     BK_WRITE_START = 0,
00297     BK_WRITE_READY,
00298     BK_WRITE_BLK,
00299     BK_WRITE_EXIT,
00300     BK_WRITE_ERR
00301 };
00302 
00303 
00304 struct blk_writer {
00305     size_t n;           /* Number of bytes per block */
00306     size_t i;           /* Number of bytes in current block */
00307     unsigned char *buf; /* Current block pointer */
00308     void *mem;          /* Space for 2 blocks of n bytes each */
00309     size_t memsz;       /* Size of mem in bytes */
00310     int fd;             /* File descriptor of file being written to */
00311     enum bk_write_state state;
00312     unsigned char *wrbuf;
00313     size_t wrbytes;
00314     pthread_attr_t attr;
00315     pthread_mutex_t mtx;
00316     pthread_cond_t cv;
00317     pthread_t thr;
00318 };
00319 
00320 
00321 /*  Writes data blocks in the background.
00322  */
00323 static void * bk_write(void *arg)
00324 {
00325     struct blk_writer *w = (struct blk_writer *) arg;
00326     pthread_mutex_lock(&w->mtx);
00327     while (1) {
00328         unsigned char *buf;
00329         size_t n;
00330         ssize_t b;
00331         /* signal readiness for a command */
00332         w->state = BK_WRITE_READY;
00333         pthread_cond_signal(&w->cv);
00334         /* wait for a command */
00335         pthread_cond_wait(&w->cv, &w->mtx);
00336         if (w->state == BK_WRITE_READY) {
00337             continue; /* nothing to do */
00338         } else if (w->state == BK_WRITE_EXIT) {
00339             break; /* exit background writer thread */
00340         }
00341         /* write a block */
00342         buf = w->wrbuf;
00343         n = w->wrbytes;
00344         pthread_mutex_unlock(&w->mtx);
00345         while (n > 0) {
00346             b = write(w->fd, buf, n);
00347             if (b < 0) {
00348                 if (errno != EINTR) {
00349                     /* error - exit background writer thread */
00350                     pthread_mutex_lock(&w->mtx);
00351                     w->state = BK_WRITE_ERR;
00352                     goto done;
00353                 }
00354             } else {
00355                 buf += b;
00356                 n -= (size_t) b;
00357             }
00358         }
00359         pthread_mutex_lock(&w->mtx);
00360     }
00361 done:
00362     pthread_cond_signal(&w->cv);
00363     pthread_mutex_unlock(&w->mtx);
00364     return NULL;
00365 }
00366 
00367 
00368 /*  Creates a new block writer with the given block size.
00369  */
00370 static struct blk_writer * blk_writer_init(const char * const file,
00371                                            size_t blksz)
00372 {
00373     struct blk_writer *w;
00374     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
00375 
00376     w = (struct blk_writer *) malloc(sizeof(struct blk_writer));
00377     if (w == NULL) {
00378         err("malloc() failed");
00379     }
00380     w->n = blksz;
00381     w->i = 0;
00382     w->fd = open(file, O_CREAT | O_TRUNC | O_APPEND | O_WRONLY,
00383                  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
00384     if (w->fd == -1) {
00385         err("failed to open file %s for writing", file);
00386     }
00387     w->memsz = 2 * blksz;
00388     if (w->memsz % pagesz != 0) {
00389         w->memsz += pagesz - w->memsz % pagesz;
00390     }
00391     w->mem = mmap(NULL, w->memsz, PROT_READ | PROT_WRITE,
00392                   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
00393     if (w->mem == MAP_FAILED) {
00394         err("write buffer allocation via mmap() failed");
00395     }
00396     w->buf = (unsigned char *) w->mem;
00397     w->state = BK_WRITE_START;
00398     w->wrbuf = NULL;
00399     w->wrbytes = 0;
00400     pthread_attr_init(&w->attr);
00401     pthread_attr_setdetachstate(&w->attr, PTHREAD_CREATE_JOINABLE);
00402     pthread_mutex_init(&w->mtx, NULL);
00403     pthread_cond_init(&w->cv, NULL);
00404     pthread_create(&w->thr, &w->attr, &bk_write, (void *) w);
00405     return w;
00406 }
00407 
00408 
00409 /*  Issues (asynchronous) write of one block.
00410  */
00411 static void blk_writer_issue(struct blk_writer * const w,
00412                              void (*sortfn)(void *, size_t))
00413 {
00414     if (sortfn != NULL) {
00415         (*sortfn)(w->buf, w->i);
00416     }
00417     pthread_mutex_lock(&w->mtx);
00418     /* wait until background writer thread is ready for a command */
00419     while (w->state != BK_WRITE_READY && w->state != BK_WRITE_ERR) {
00420         pthread_cond_wait(&w->cv, &w->mtx);
00421     }
00422     if (w->state == BK_WRITE_ERR) {
00423         pthread_mutex_unlock(&w->mtx);
00424         err("background thread failed to write() disk block");
00425     }
00426     /* issue write for the current block */
00427     w->state = BK_WRITE_BLK;
00428     w->wrbuf = w->buf;
00429     w->wrbytes = w->i;
00430     pthread_cond_signal(&w->cv);
00431     pthread_mutex_unlock(&w->mtx);
00432     /* flip write buffer */
00433     w->i = 0;
00434     if (w->buf == (unsigned char *) w->mem) {
00435         w->buf = ((unsigned char *) w->mem) + w->n;
00436     } else {
00437         w->buf = ((unsigned char *) w->mem);
00438     }
00439 }
00440 
00441 
00442 /*  Closes writer w; all as yet unwritten data is flushed to disk first.
00443 
00444     If a non-NULL sorting function pointer is supplied, unwritten
00445     data is sorted prior to being written.
00446  */
00447 static void blk_writer_close(struct blk_writer * const w,
00448                              void (*sortfn)(void *, size_t))
00449 {
00450     blk_writer_issue(w, sortfn);
00451     pthread_mutex_lock(&w->mtx);
00452     /* wait until background writer thread is ready for a command */
00453     while (w->state != BK_WRITE_READY && w->state != BK_WRITE_ERR) {
00454         pthread_cond_wait(&w->cv, &w->mtx);
00455     }
00456     if (w->state == BK_WRITE_ERR) {
00457         pthread_mutex_unlock(&w->mtx);
00458         err("background thread failed to write() disk block");
00459     }
00460     /* issue thread exit command ... */
00461     w->state = BK_WRITE_EXIT;
00462     pthread_cond_signal(&w->cv);
00463     pthread_mutex_unlock(&w->mtx);
00464     /* ... and wait for writer thread to terminate */
00465     pthread_join(w->thr, NULL);
00466     /* clean up */
00467     if (munmap(w->mem, w->memsz) != 0) {
00468         err("munmap() of write buffers failed");
00469     }
00470     if (fdatasync(w->fd) != 0) {
00471         err("fdatasync() failed");
00472     }
00473     if (close(w->fd) != 0) {
00474         err("file close() failed");
00475     }
00476     pthread_cond_destroy(&w->cv);
00477     pthread_mutex_destroy(&w->mtx);
00478     pthread_attr_destroy(&w->attr);
00479     free(w);
00480 }
00481 
00482 
00483 /*  Appends data to the given block writer.
00484  */
00485 HTM_INLINE void blk_writer_append(struct blk_writer * const w,
00486                                   const void * const data,
00487                                   const size_t nbytes,
00488                                   void (*sortfn)(void *, size_t))
00489 {
00490     size_t i = w->i;
00491     if (i + nbytes > w->n) {
00492         blk_writer_issue(w, sortfn);
00493         i = 0;
00494     }
00495     assert(i + nbytes <= w->n);
00496     memcpy(w->buf + i, data, nbytes);
00497     w->i = i + nbytes;
00498 }
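/*  Usage sketch (illustrative only, not compiled into the library; the
    file name and 1 MiB block size are hypothetical): items are appended
    to the in-memory block while the previously filled block is written
    to disk by the background thread.
 */
#if 0
static void example_blk_writer(const int64_t *items, size_t n)
{
    size_t i;
    struct blk_writer *w = blk_writer_init("items.bin", 1024*1024);
    for (i = 0; i < n; ++i) {
        blk_writer_append(w, &items[i], sizeof(int64_t), NULL);
    }
    /* flush the partially filled block and join the writer thread */
    blk_writer_close(w, NULL);
}
#endif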
00499 
00500 
00501 /* ---- Multi-way external merge sort ---- */
00502 
00503 /*  A contiguous sequence of sorted items, dubbed a multi-way merge segment.
00504  */
00505 struct mrg_seg {
00506     const void *cur;
00507     const void *end;
00508     const void *blk;
00509 };
00510 
00511 
00512 /*  Initializes a merge segment.
00513  */
00514 static void mrg_seg_init(struct mrg_seg * const s,
00515                          const void * start,
00516                          const void * end,
00517                          const size_t blksz)
00518 {
00519     size_t n;
00520     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
00521     size_t nbytes = (const char *) end - (const char *) start;
00522     if (end <= start || blksz % pagesz != 0) {
00523         err("Invalid merge segment");
00524     }
00525     s->cur = start;
00526     s->end = end;
00527     n = ((size_t) start) % pagesz;
00528     if (n != 0) {
00529         start = (const char *) start - n;
00530         nbytes += n;
00531     }
00532     n = (nbytes > 2*blksz ? 2*blksz : nbytes);
00533     if (n % pagesz != 0) {
00534         n += pagesz - n % pagesz;
00535     }
00536     s->blk = (const char *) start + blksz;
00537     if (madvise((void *) start, n, MADV_WILLNEED) != 0) {
00538         err("madvise() failed");
00539     }
00540 }
00541 
00542 
00543 /*  Handles prefetch and merge segment exhaustion.
00544  */
00545 static int mrg_seg_advance(struct mrg_seg * const s,
00546                            const size_t blksz,
00547                            const void * const cur)
00548 {
00549     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
00550 
00551     if (cur == s->end) {
00552         void *start = (char *) s->blk - blksz;
00553         size_t n = (char *) s->end - (char *) start;
00554         if (n % pagesz != 0) {
00555             n += pagesz - n % pagesz;
00556         }
00557         if (madvise(start, n, MADV_DONTNEED) != 0) {
00558             err("madvise() failed");
00559         }
00560         return 0;
00561     }
00562     assert(cur >= s->blk && cur < s->end);
00563     if (madvise((char *) s->blk - blksz, blksz, MADV_DONTNEED) != 0) {
00564         err("madvise() failed");
00565     }
00566     s->cur = cur;
00567     s->blk = (const char *) s->blk + blksz;
00568     if (s->blk < s->end) {
00569         size_t n = (char *) s->end - (char *) s->blk;
00570         if (n >= blksz) {
00571             n = blksz;
00572         } else if (n % pagesz != 0) {
00573             n += pagesz - n % pagesz;
00574         }
00575         if (madvise((void *) s->blk, n, MADV_WILLNEED) != 0) {
00576             err("madvise() failed");
00577         }
00578     }
00579     return 1;
00580 }
00581 
00582 
00583 /*  Consumes one item in the given merge segment; returns 0 if there
00584     are no more items in the segment.
00585  */
00586 HTM_INLINE int mrg_seg_consume(struct mrg_seg * const s,
00587                                const size_t blksz,
00588                                const size_t itemsz)
00589 {
00590     const void *cur = (const char *) s->cur + itemsz;
00591     if (cur < s->end && cur < s->blk) {
00592         s->cur = cur;
00593         return 1;
00594     }
00595     return mrg_seg_advance(s, blksz, cur);
00596 }
00597 
00598 
00599 /*  Adds segs[n] to the min-heap segs[0], segs[1], ..., segs[n - 1].
00600  */
00601 static void heap_up(struct mrg_seg *segs,
00602                     size_t n,
00603                     int (*cmpfn)(const void *, const void *))
00604 {
00605     struct mrg_seg tmp;
00606     size_t p;
00607 
00608     while (n != 0) {
00609         p = (n - 1) / 2;
00610         if ((*cmpfn)(segs[p].cur, segs[n].cur) != 0) {
00611             break;
00612         }
00613         tmp = segs[p];
00614         segs[p] = segs[n];
00615         segs[n] = tmp;
00616         n = p;
00617     }
00618 }
00619 
00620 
00621 /*  Fixes a possible min-heap violation by segs[0].
00622  */
00623 static void heap_down(struct mrg_seg *segs,
00624                       const size_t n,
00625                       int (*cmpfn)(const void *, const void *))
00626 {
00627     struct mrg_seg tmp;
00628     size_t i;
00629 
00630     if (n > 1) {
00631         i = 0;
00632         while (1) {
00633             size_t left = 2*i + 1;
00634             size_t right = 2*i + 2;
00635             size_t least = i;
00636             if (left < n && (*cmpfn)(segs[left].cur, segs[i].cur) != 0) {
00637                 least = left;
00638             }
00639             if (right < n && (*cmpfn)(segs[right].cur, segs[least].cur) != 0) {
00640                 least = right;
00641             }
00642             if (least == i) {
00643                 break;
00644             }
00645             tmp = segs[i];
00646             segs[i] = segs[least];
00647             segs[least] = tmp;
00648             i = least;
00649         }
00650     }
00651 }
00652 
00653 
00654 /*  Performs one multi-way merge pass.
00655  */
00656 static void mrg_pass(struct blk_writer * const w,
00657                      const void * const data,
00658                      const struct mem_params * const mem,
00659                      const size_t filesz,
00660                      const size_t sortsz,
00661                      const size_t itemsz,
00662                      int (*cmpfn)(const void *, const void *))
00663 {
00664     struct mrg_seg *segs;
00665     size_t start;
00666 
00667     /* allocate merge segments */
00668     segs = (struct mrg_seg *) malloc(
00669         mem->k * sizeof(struct mrg_seg));
00670     if (segs == NULL) {
00671         err("malloc() failed");
00672     }
00673     for (start = 0; start < filesz;) {
00674         size_t ns, end;
00675         /* initialize up to mem->k merge segments */
00676         for (ns = 0; ns < mem->k && start < filesz; ++ns, start = end) {
00677             end = (start + sortsz > filesz ? filesz : start + sortsz);
00678             mrg_seg_init(&segs[ns],
00679                          (const char *) data + start,
00680                          (const char *) data + end,
00681                          mem->ioblksz);
00682             heap_up(segs, ns, cmpfn);
00683         }
00684         /* merge ns segments */
00685         while (ns > 0) {
00686             /* write minimum value from all ns merge segments to disk */
00687             blk_writer_append(w, segs->cur, itemsz, NULL);
00688             if (mrg_seg_consume(segs, mem->ioblksz, itemsz) == 0) {
00689                 segs[0] = segs[ns - 1];
00690                 --ns;
00691             }
00692             heap_down(segs, ns, cmpfn);
00693         }
00694     }
00695     free(segs);
00696 }
00697 
00698 
00699 /*  Computes the number of k-way merge passes required to merge n sorted
00700     runs, i.e. the ceiling of the base-k logarithm of n.
00701  */
00702 static int mrg_npasses(size_t n, size_t k) {
00703     int m = 1;
00704     size_t b = k;
00705     while (b < n) {
00706         b *= k;
00707         ++m;
00708     }
00709     return m;
00710 }
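/*  Example: merging 150 sorted runs with k = 16 takes 2 passes, since one
    pass reduces 150 runs to ceil(150/16) = 10 and a second pass merges
    those 10 into a single run; mrg_npasses(150, 16) therefore returns 2,
    the ceiling of the base-16 logarithm of 150.
 */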
00711 
00712 
00713 /*  Sorts the given file using multi-way external merge sort.
00714 
00715     @param[in] file     File to sort. Runs of size mem->sortsz are assumed
00716                         to be sorted already.
00717     @param[in] scratch  Scratch (temporary) file name.
00718     @param[in] mem      Memory parameters.
00719     @param[in] itemsz   Size of a single item in bytes.
00720     @param[in] nitems   Number of items in file.
00721  */
00722 static void ext_sort(const char *file,
00723                      const char *scratch,
00724                      const struct mem_params *mem,
00725                      size_t itemsz,
00726                      size_t nitems,
00727                      int (*cmpfn)(const void *, const void *))
00728 {
00729     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
00730     const size_t filesz = nitems * itemsz;
00731     size_t sortsz = mem->sortsz - mem->sortsz % itemsz;
00732     const size_t nblk = filesz / sortsz + (filesz % sortsz != 0 ? 1u : 0u);
00733     const int nmp = mrg_npasses(nblk, mem->k);
00734     int mp;
00735     double ttot;
00736 
00737     if (nblk < 2) {
00738         msg("Skipping multi-way merge step (%s already sorted)\n", file);
00739         return;
00740     }
00741     msg("Multi-way external merge sort of %s\n", file);
00742     ttot = now();
00743 
00744     /* Perform k-way merge passes; the sorted block grows by a
00745        factor of k on every pass */
00746     for (mp = 0; mp < nmp; ++mp, sortsz *= mem->k) {
00747         const char *inf;
00748         const char *outf;
00749         const void *data;
00750         struct blk_writer *w;
00751         double t;
00752         size_t nmap;
00753         int infd;
00754 
00755         msg("\t- merge pass %d/%d ... ", mp + 1, nmp);
00756         t = now();
00757 
00758         /* memory map input file and create writer for output file */
00759         if (mp % 2 == 0) {
00760             inf = file;
00761             outf = scratch;
00762         } else {
00763             inf = scratch;
00764             outf = file;
00765         }
00766         infd = open(inf, O_RDONLY);
00767         if (infd == -1) {
00768             err("failed to open file %s for reading", inf);
00769         }
00770         if (filesz % pagesz != 0) {
00771             nmap = filesz + (pagesz - filesz % pagesz);
00772         } else {
00773             nmap = filesz;
00774         }
00775         data = mmap(NULL, nmap, PROT_READ,
00776                     MAP_SHARED | MAP_NORESERVE, infd, 0);
00777         if (data == MAP_FAILED) {
00778             err("failed to mmap() file %s", inf);
00779         }
00780         if (madvise((void *) data, filesz, MADV_DONTNEED) != 0) {
00781             err("madvise() failed");
00782         }
00783         w = blk_writer_init(outf, mem->ioblksz);
00784         /* perform merges */
00785         mrg_pass(w, data, mem, filesz, sortsz, itemsz, cmpfn);
00786         /* cleanup */
00787         blk_writer_close(w, NULL);
00788         if (munmap((void *) data, nmap) != 0) {
00789             err("failed to munmap() file %s", inf);
00790         }
00791         if (close(infd) != 0) {
00792             err("failed to close() file %s", inf);
00793         }
00794         msg("%.3f sec\n", now() - t);
00795     }
00796     /* make sure sorted results are in data file and delete scratch file. */
00797     if (nmp % 2 == 1) {
00798         if (rename(scratch, file) != 0) {
00799             err("failed to rename file %s to %s", scratch, file);
00800         }
00801     } else {
00802         if (unlink(scratch) != 0) {
00803             err("failed to delete file %s", scratch);
00804         }
00805     }
00806     msg("\t%.3f sec total\n\n", now() - ttot);
00807 }
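/*  Usage sketch (illustrative only, not compiled into the library; the
    file names, item type and memory budget are hypothetical): sorts a
    binary file of 64-bit keys whose mem->sortsz-byte runs were already
    sorted when the file was written, e.g. by a blk_writer with a sort
    function.
 */
#if 0
static int example_key_lt(const void *a, const void *b)
{
    /* non-zero iff the first key is less than the second */
    return *((const int64_t *) a) < *((const int64_t *) b);
}

static void example_ext_sort(size_t nkeys)
{
    struct mem_params mem;
    mem_params_init(&mem, 256*1024*1024, 1024*1024);
    ext_sort("keys.bin", "keys.bin.tmp", &mem,
             sizeof(int64_t), nkeys, &example_key_lt);
}
#endif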
00808 
00809 
00810 /* ---- Fast constant size memory allocator ---- */
00811 
00812 /*  Undefine FAST_ALLOC (or define it as 0) to allocate nodes with malloc()
00813     instead. Useful when checking memory safety, e.g. with valgrind.
00814  */
00815 #define FAST_ALLOC 1
00816 
00817 #if FAST_ALLOC
00818 #   define ARENA_SEGSZ 16*1024*1024
00819 
00820 /*  A contiguous memory segment belonging to an arena.
00821  */
00822 struct arena_seg {
00823     struct arena_seg *prev;
00824     void *mem;
00825 };
00826 
00827 static struct arena_seg * arena_seg_init(struct arena_seg * const prev,
00828                                          const size_t itemsz)
00829 {
00830     unsigned char *node, *end;
00831     struct arena_seg *seg = (struct arena_seg *) malloc(sizeof(struct arena_seg));
00832     if (seg == NULL) {
00833         err("malloc() failed");
00834     }
00835     seg->prev = prev;
00836     seg->mem = mmap(NULL, ARENA_SEGSZ, PROT_READ | PROT_WRITE,
00837                     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
00838     if (seg->mem == MAP_FAILED) {
00839         err("mmap() failed");
00840     }
00841     /* initialize free list */
00842     node = (unsigned char *) seg->mem;
00843     end = node + (ARENA_SEGSZ/itemsz - 1)*itemsz;
00844     while (node < end) {
00845         *((void **) node) = node + itemsz;
00846         node += itemsz;
00847     }
00848     *((void **) node) = NULL;
00849     return seg;
00850 }
00851 
00852 /*  A (non-shrinkable) memory arena. Memory nodes can be freed one
00853     at a time, or en masse.
00854  */
00855 struct arena {
00856     struct arena_seg *tail; /* reverse linked list of memory segments */
00857     struct mem_node *head;  /* head of linked list of free memory locations */
00858     size_t itemsz;
00859     size_t nseg;
00860 };
00861 
00862 static void arena_init(struct arena * const a, const size_t itemsz)
00863 {
00864     a->tail = arena_seg_init(NULL, itemsz);
00865     a->head = a->tail->mem;
00866     a->itemsz = itemsz;
00867     a->nseg = 1;
00868 }
00869 
00870 static void arena_destroy(struct arena * const a)
00871 {
00872     struct arena_seg *seg = a->tail;
00873     while (seg != NULL) {
00874         struct arena_seg *prev = seg->prev;
00875         if (munmap(seg->mem, ARENA_SEGSZ) != 0) {
00876             err("munmap() failed");
00877         }
00878         seg->prev = NULL;
00879         seg->mem = NULL;
00880         free(seg);
00881         seg = prev;
00882     }
00883     a->tail = NULL;
00884     a->head = NULL;
00885     a->nseg = 0;
00886 }
00887 
00888 HTM_INLINE void * arena_alloc(struct arena * const a)
00889 {
00890     void *item;
00891     if (a->head == NULL) {
00892         a->tail = arena_seg_init(a->tail, a->itemsz);
00893         a->head = a->tail->mem;
00894         ++a->nseg;
00895     }
00896     item = a->head;
00897     a->head = *((void **) item);
00898     return item;
00899 }
00900 
00901 HTM_INLINE void arena_free(struct arena * const a, void * const n)
00902 {
00903     *((void **) n) = a->head;
00904     a->head = n;
00905 }
00906 
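/*  Usage sketch (illustrative only, not compiled into the library; the
    64 byte item size is hypothetical): the arena hands out fixed-size
    items carved from mmap()ed segments, and freed items are pushed onto
    a free list for reuse.
 */
#if 0
static void example_arena(void)
{
    struct arena a;
    void *item;
    arena_init(&a, 64);    /* item size must be >= sizeof(void *) */
    item = arena_alloc(&a);
    memset(item, 0, 64);
    arena_free(&a, item);  /* item goes back on the free list */
    arena_destroy(&a);
}
#endif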
00907 #endif /* FAST_ALLOC */
00908 
00909 
00910 /* ================================================================ */
00911 /*           Phase 1: Produce data file from ASCII inputs           */
00912 /* ================================================================ */
00913 
00914 /*  An entry in an HTM tree.
00915  */
00916 struct tree_entry {
00917     int64_t htmid;
00918     int64_t rowid;
00919     struct htm_sc sc;
00920 } HTM_ALIGNED(16);
00921 
00922 /*  Tests whether tree entry e1 is less than e2; entries are
00923     compared primarily by HTM ID, with row IDs used to
00924     break ties.
00925   */
00926 HTM_INLINE int tree_entry_lt(const struct tree_entry *e1,
00927                              const struct tree_entry *e2)
00928 {
00929     return (e1->htmid < e2->htmid ||
00930             (e1->htmid == e2->htmid && e1->rowid < e2->rowid));
00931 }
00932 
00933 /*  Returns 1 if the first tree entry is less than the second.
00934  */
00935 static int tree_entry_cmp(const void *e1, const void *e2)
00936 {
00937     return tree_entry_lt((const struct tree_entry *) e1,
00938                          (const struct tree_entry *) e2);
00939 }
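/*  Example: entries with (htmid, rowid) = (100, 7) and (100, 9) sort in
    that order, because the HTM IDs tie and the row IDs break the tie;
    an entry with (99, 1000) sorts before both since its HTM ID is smaller.
 */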
00940 
00941 /* ---- Quicksort of tree entries ---- */
00942 
00943 static void tree_entry_isort(struct tree_entry *entries, size_t n)
00944 {
00945     size_t i, j, k;
00946     for (i = 0; i < n; ++i) {
00947         k = i;
00948         for (j = i + 1; j < n; ++j) {
00949             if (tree_entry_lt(&entries[j], &entries[k])) {
00950                 k = j;
00951             }
00952         }
00953         if (k != i) {
00954             struct tree_entry tmp = entries[k];
00955             entries[k] = entries[i];
00956             entries[i] = tmp;
00957         }
00958     }
00959 }
00960 
00961 static void tree_entry_qsort(struct tree_entry *entries,
00962                              size_t left,
00963                              size_t right)
00964 {
00965     struct tree_entry pivot;
00966     size_t l, r, mid;
00967 
00968     while (1) {
00969 
00970         if (right <= left + 8) {
00971             tree_entry_isort(entries + left, right - left + 1);
00972             return;
00973         }
00974 
00975         /* find median-of-3 */
00976         mid = left + ((right - left) >> 1);
00977         if (tree_entry_lt(entries + left, entries + mid)) {
00978             if (tree_entry_lt(entries + right, entries + left)) {
00979                 mid = left; /* right, left, mid */
00980             } else if (tree_entry_lt(entries + right, entries + mid)) {
00981                 mid = right; /* left, right, mid */
00982             } /* left, mid, right */
00983         } else {
00984             if (tree_entry_lt(entries + left, entries + right)) {
00985                 mid = left; /* mid, left, right */
00986             } else if (tree_entry_lt(entries + mid, entries + right)) {
00987                 mid = right; /* mid, right, left */
00988             } /* right, mid, left */
00989         }
00990 
00991         /* set pivot to median-of-3 and store at left-most array location */
00992         pivot = entries[mid];
00993         if (mid != left) {
00994             entries[mid] = entries[left];
00995             entries[left] = pivot;
00996         }
00997 
00998         /* partition around pivot */
00999         l = left;
01000         r = right;
01001         do {
01002             while (tree_entry_lt(&pivot, &entries[right])) {
01003                 --right;
01004                 if (left >= right) {
01005                     goto end;
01006                 }
01007             }
01008             entries[left] = entries[right];
01009             do {
01010                 ++left;
01011                 if (left >= right) {
01012                     left = right;
01013                     goto end;
01014                 }
01015             } while (tree_entry_lt(&entries[left], &pivot));
01016             entries[right] = entries[left];
01017             --right;
01018         } while (left < right);
01019 end:
01020         entries[left] = pivot;
01021 
01022         /* recurse on smaller partition */
01023         if (2*left <= r + l) {
01024             if (l < left) {
01025                 tree_entry_qsort(entries, l, left - 1);
01026             }
01027             ++left;
01028             right = r;
01029         } else {
01030             if (r > left) {
01031                 tree_entry_qsort(entries, left + 1, r);
01032             }
01033             right = left - 1;
01034             left = l;
01035         }
01036         /* process larger partition without recursing to limit stack usage */
01037     }
01038 }
01039 
01040 
01041 /*  Sorts a memory block containing tree entries.
01042  */
01043 static void tree_entry_sort(void *data, size_t nbytes)
01044 {
01045     size_t n;
01046     assert(((size_t) data) % 16 == 0 &&
01047            "block pointer not aligned to a multiple of 16 bytes");
01048     assert(nbytes % sizeof(struct tree_entry) == 0 &&
01049            "block size not a multiple of sizeof(struct tree_entry)");
01050     n = nbytes / sizeof(struct tree_entry);
01051     if (n > 1) {
01052         tree_entry_qsort((struct tree_entry *) data, 0, n - 1);
01053     }
01054 }
01055 
01056 
01057 /* ---- Convert text inputs to a block-sorted binary tree entry file ---- */
01058 
01059 static char * eat_delim(char *s, char delim, const char *fname, size_t lineno)
01060 {
01061     for (; isspace(*s) && *s != delim; ++s) { }
01062     if (*s != delim) {
01063         err("[%s:%llu] - invalid/truncated record",
01064             fname, (unsigned long long) lineno);
01065     }
01066     return s + 1;
01067 }
01068 
01069 static char * eat_ws(char *s, char delim, const char *fname, size_t lineno)
01070 {
01071     for (; isspace(*s) && *s != delim; ++s) { }
01072     if (*s == delim || *s == '\0') {
01073         err("[%s:%llu] - invalid/truncated record",
01074             fname, (unsigned long long) lineno);
01075     }
01076     return s;
01077 }
01078 
01079 
01080 /*  Converts ASCII input files to a block-sorted binary file.
01081  */
01082 static size_t blk_sort_ascii(char **infile,
01083                              const int nfile,
01084                              const char *outfile,
01085                              const char delim,
01086                              const struct mem_params * const mem)
01087 {
01088     char line[16384];
01089     size_t nentries;
01090     struct blk_writer *out;
01091     double ttot;
01092     int i;
01093 
01094     if (mem->sortsz % sizeof(struct tree_entry) != 0) {
01095         err("Write block size is not a multiple of "
01096             "sizeof(struct tree_entry)");
01097     }
01098     msg("Creating block-sorted tree entry file %s from ASCII file(s)\n",
01099         outfile);
01100     ttot = now();
01101     nentries = 0;
01102     out = blk_writer_init(outfile, mem->sortsz);
01103 
01104     /* For each input file... */
01105     for (i = 0; i < nfile; ++i) {
01106         FILE *f;
01107         size_t lineno;
01108         struct tree_entry entry;
01109         struct htm_v3 v;
01110         double t;
01111 
01112         msg("\t- processing %s ... ", infile[i]);
01113         t = now();
01114 
01115         /* open input file */
01116         f = fopen(infile[i], "r");
01117         if (f == NULL) {
01118             err("Failed to open file %s for reading", infile[i]);
01119         }
01120         for (lineno = 1; fgets(line, sizeof(line), f) != NULL; ++lineno) {
01121             char *s, *endptr;
01122             double lon, lat;
01123             size_t len = strlen(line);
01124 
01125             if (line[len - 1] != '\n') {
01126                 if (feof(f) == 0) {
01127                     err("Line %llu of file %s is too long (> %d characters)",
01128                         (unsigned long long) lineno, infile[i],
01129                         (int) sizeof(line));
01130                 }
01131             }
01132             s = eat_ws(line, delim, infile[i], lineno);
01133             entry.rowid = (int64_t) strtoll(s, &endptr, 0);
01134             if (endptr == s || endptr == NULL || errno != 0) {
01135                 err("[%s:%llu] - failed to convert row_id to an integer",
01136                     infile[i], (unsigned long long) lineno);
01137             }
01138             s = eat_delim(endptr, delim, infile[i], lineno);
01139             s = eat_ws(s, delim, infile[i], lineno);
01140             lon = strtod(s, &endptr);
01141             if (endptr == s || endptr == NULL || errno != 0) {
01142                 err("[%s:%llu] - failed to convert right ascension/longitude "
01143                     "to a double", infile[i], (unsigned long long) lineno);
01144             }
01145             s = eat_delim(endptr, delim, infile[i], lineno);
01146             s = eat_ws(s, delim, infile[i], lineno);
01147             lat = strtod(s, &endptr);
01148             if (endptr == s || endptr == NULL || errno != 0) {
01149                 err("[%s:%llu] - failed to convert declination/latitude "
01150                     "to a double", infile[i], (unsigned long long) lineno);
01151             }
01152             s = endptr;
01153             if (*s != delim && *s != '\0' && !isspace(*s)) {
01154                 err("[%s:%llu] - invalid record",
01155                     infile[i], (unsigned long long) lineno);
01156             }
01157             /* compute and store tree_entry for line */
01158             if (htm_sc_init(&entry.sc, lon, lat) != HTM_OK) {
01159                 err("[%s:%llu] - invalid spherical coordinates",
01160                     infile[i], (unsigned long long) lineno);
01161             }
01162             if (htm_sc_tov3(&v, &entry.sc) != HTM_OK) {
01163                 err("[%s:%llu] - failed to convert spherical coordinates "
01164                     "to a unit vector", infile[i], (unsigned long long) lineno);
01165             }
01166             entry.htmid = htm_v3_id(&v, 20);
01167             if (entry.htmid == 0) {
01168                 err("[%s:%llu] - failed to compute HTM ID for spherical "
01169                     "coordinates", infile[i], (unsigned long long) lineno);
01170             }
01171             blk_writer_append(out, &entry, sizeof(struct tree_entry),
01172                               &tree_entry_sort);
01173         }
01174         if (ferror(f) != 0) {
01175             err("failed to read file %s", infile[i]);
01176         }
01177         if (fclose(f) != 0) {
01178             err("failed to close file %s", infile[i]);
01179         }
01180         nentries += lineno - 1;
01181         msg("%llu records in %.3f sec\n",
01182             (unsigned long long) (lineno - 1), now() - t);
01183         /* advance to next input file */
01184     }
01185 
01186     /* flush and close block writer */
01187     blk_writer_close(out, &tree_entry_sort);
01188     msg("\t%.3f sec for %llu records total\n\n",
01189         now() - ttot, (unsigned long long) nentries);
01190     return nentries;
01191 }
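/*  Example input record (illustrative; assumes delim == ','):

        12345, 210.5, 54.25

    i.e. an integer row ID followed by the right ascension/longitude and
    declination/latitude values accepted by htm_sc_init(). Each such line
    is converted to a struct tree_entry containing the row ID, the
    spherical coordinates and the level-20 HTM ID of the position.
 */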
01192 
01193 
01194 /* ================================================================ */
01195 /*               Phase 2: Tree generation and layout                */
01196 /* ================================================================ */
01197 
01198 /*  Number of levels-of-detail used by Split-and-Refine.
01199  */
01200 #define NLOD 5
01201 
01202 /*  Tree layout block sizes in bytes, from largest to smallest.
01203  */
01204 static const uint32_t layout_size[NLOD] = {
01205     2097152,    /* 2MiB: large page size for modern x86 processors. */
01206     65536,      /* Between small and large page size. Chosen in hopes
01207                    of improving OS disk prefetch effectiveness. */
01208     4096,       /* 4KiB: default page size for modern x86 processors */
01209     256,        /* Between cache line and small page sizes. Chosen in hopes
01210                    of improving HW cache-line prefetch effectiveness. */
01211     64          /* L1/L2/L3 cache-line size for modern x86 processors. */
01212 };
01213 
01214 /*  A node ID, consisting of NLOD block IDs (one per layout block size)
01215     and the index of the node in a post-order traversal. This last index
01216     makes node IDs unique (more than one node might fit in a block at
01217     the finest LOD).
01218  */
01219 struct node_id {
01220     uint64_t block[NLOD + 1];
01221 } HTM_ALIGNED(16);
01222 
01223 /*  On-disk representation of a tree node.
01224  */
01225 struct disk_node {
01226     struct node_id id;
01227     uint64_t count;
01228     uint64_t index;
01229     struct node_id child[4];
01230 } HTM_ALIGNED(16);
01231 
01232 /*  Returns 1 if the given node ID corresponds to an empty child
01233     (all block IDs are zero).
01234  */
01235 HTM_INLINE int node_empty(const struct node_id * const id)
01236 {
01237     int i;
01238     for (i = 0; i < NLOD + 1; ++i) {
01239         if (id->block[i] != 0) {
01240             return 0;
01241         }
01242     }
01243     return 1;
01244 }
01245 
01246 /*  Returns 1 if the given node IDs are equal.
01247  */
01248 HTM_INLINE int node_id_eq(const struct node_id * const id1,
01249                           const struct node_id * const id2)
01250 {
01251     int i;
01252     for (i = 0; i < NLOD + 1; ++i) {
01253         if (id1->block[i] != id2->block[i]) {
01254             return 0;
01255         }
01256     }
01257     return 1;
01258 }
01259 
01260 /*  Returns 1 if the first node ID is less than the second. We say that
01261     a node N1 is less than N2 if the string of block IDs for N1 is
01262     lexicographically less than the string for N2.
01263  */
01264 HTM_INLINE int node_id_lt(const struct node_id * const id1,
01265                           const struct node_id * const id2)
01266 {
01267     int i;
01268     for (i = 0; i < NLOD + 1; ++i) {
01269         if (id1->block[i] < id2->block[i]) {
01270             return 1;
01271         } else if (id1->block[i] > id2->block[i]) {
01272             break;
01273         }
01274     }
01275     return 0;
01276 }
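/*  Example (NLOD + 1 == 6 block IDs per node): the IDs {1,1,2,7,9,42} and
    {1,1,3,0,0,7} compare as {1,1,2,...} < {1,1,3,...}; the first differing
    block ID decides the ordering, exactly as in lexicographic string
    comparison.
 */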
01277 
01278 /*  Compares nodes by ID.
01279  */
01280 HTM_INLINE int disk_node_lt(const struct disk_node *n1,
01281                             const struct disk_node *n2)
01282 {
01283     return node_id_lt(&n1->id, &n2->id);
01284 }
01285 
01286 static int disk_node_cmp(const void *n1, const void * n2)
01287 {
01288     return disk_node_lt((const struct disk_node *) n1,
01289                         (const struct disk_node *) n2);
01290 }
01291 
01292 
01293 /* ---- Quicksort of disk nodes ---- */
01294 
01295 static void disk_node_isort(struct disk_node *nodes, size_t n)
01296 {
01297     size_t i, j, k;
01298     for (i = 0; i < n; ++i) {
01299         k = i;
01300         for (j = i + 1; j < n; ++j) {
01301             if (disk_node_lt(&nodes[j], &nodes[k])) {
01302                 k = j;
01303             }
01304         }
01305         if (k != i) {
01306             struct disk_node tmp = nodes[k];
01307             nodes[k] = nodes[i];
01308             nodes[i] = tmp;
01309         }
01310     }
01311 }
01312 
01313 static void disk_node_qsort(struct disk_node *nodes, size_t left, size_t right)
01314 {
01315     struct disk_node pivot;
01316     size_t l, r, mid;
01317 
01318     while (1) {
01319 
01320         if (right <= left + 8) {
01321             disk_node_isort(nodes + left, right - left + 1);
01322             return;
01323         }
01324 
01325         /* find median-of-3 */
01326         mid = left + ((right - left) >> 1);
01327         if (disk_node_lt(nodes + left, nodes + mid)) {
01328             if (disk_node_lt(nodes + right, nodes + left)) {
01329                 mid = left; /* right, left, mid */
01330             } else if (disk_node_lt(nodes + right, nodes + mid)) {
01331                 mid = right; /* left, right, mid */
01332             } /* left, mid, right */
01333         } else {
01334             if (disk_node_lt(nodes + left, nodes + right)) {
01335                 mid = left; /* mid, left, right */
01336             } else if (disk_node_lt(nodes + mid, nodes + right)) {
01337                 mid = right; /* mid, right, left */
01338             } /* right, mid, left */
01339         }
01340 
01341         /* set pivot to median-of-3 and store at left-most array location */
01342         pivot = nodes[mid];
01343         if (mid != left) {
01344             nodes[mid] = nodes[left];
01345             nodes[left] = pivot;
01346         }
01347 
01348         /* partition around pivot */
01349         l = left;
01350         r = right;
01351         do {
01352             while (disk_node_lt(&pivot, &nodes[right])) {
01353                 --right;
01354                 if (left >= right) {
01355                     goto end;
01356                 }
01357             }
01358             nodes[left] = nodes[right];
01359             do {
01360                 ++left;
01361                 if (left >= right) {
01362                     left = right;
01363                     goto end;
01364                 }
01365             } while (disk_node_lt(&nodes[left], &pivot));
01366             nodes[right] = nodes[left];
01367             --right;
01368         } while (left < right);
01369 end:
01370         nodes[left] = pivot;
01371 
01372         /* recurse on smaller partition */
01373         if (2*left <= r + l) {
01374             if (l < left) {
01375                 disk_node_qsort(nodes, l, left - 1);
01376             }
01377             ++left;
01378             right = r;
01379         } else {
01380             if (r > left) {
01381                 disk_node_qsort(nodes, left + 1, r);
01382             }
01383             right = left - 1;
01384             left = l;
01385         }
01386         /* process larger partition without recursing to limit stack usage */
01387     }
01388 }
01389 
01390 
01391 /*  Sorts a memory block containing disk nodes.
01392  */
01393 static void disk_node_sort(void *data, size_t nbytes)
01394 {
01395     size_t n;
01396     assert(((size_t) data) % 16 == 0 &&
01397            "block pointer not aligned to a multiple of 16 bytes");
01398     assert(nbytes % sizeof(struct disk_node) == 0 &&
01399            "block size not a multiple of sizeof(struct disk_node)");
01400     n = nbytes / sizeof(struct disk_node);
01401     if (n > 1) {
01402         disk_node_qsort((struct disk_node *) data, 0, n - 1);
01403     }
01404 }
01405 
01406 
01407 /* ---- In-memory node representation ---- */
01408 
01409 /*  Status of an in-memory tree node.
01410  */
01411 enum node_status {
01412     NODE_INIT = 0,  /* node is minty fresh */
01413     NODE_EMITTED,   /* node has been processed by emit_node() */
01414     NODE_LAID_OUT,  /* node has been processed by layout_node() */
01415 };
01416 
01417 /*  In-memory representation of a tree node.  Note that block size/depth
01418     is packed into a single 32 bit integer; as a result, nodes stay
01419     compact (128 bytes with 8-byte pointers) and nicely cache aligned.
01420  */
01421 struct mem_node {
01422     struct node_id id;        /* Hierarchical ID for node */
01423     int64_t htmid;            /* HTM ID of node */
01424     uint64_t index;           /* File offset of first point in node */
01425     uint64_t count;           /* Number of points in node */
01426     enum node_status status;
01427     uint32_t blockinfo[NLOD]; /* Clark & Munro: block depth (8 MSBs) and
01428                                  block size (24 LSBs) for each LOD. */
01429     struct mem_node *child[4];
01430 } HTM_ALIGNED(16);
01431 
01432 
01433 /* get/set block size/depth from 32 bit blockinfo */
01434 HTM_INLINE uint32_t get_block_size(uint32_t blockinfo) {
01435     return blockinfo & 0xffffff;
01436 }
01437 HTM_INLINE uint8_t get_block_depth(uint32_t blockinfo) {
01438     return blockinfo >> 24;
01439 }
01440 HTM_INLINE uint32_t make_block_info(uint32_t size, uint8_t depth) {
01441     return (size & 0xffffff) | (((uint32_t) depth) << 24);
01442 }
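/*  Example: a subtree that currently occupies 1200 bytes at block depth 2
    is encoded as make_block_info(1200, 2) == 0x020004B0, and
    get_block_size()/get_block_depth() recover 1200 and 2 from the packed
    value. Sizes are limited to 24 bits (16 MiB), comfortably above the
    largest layout block size of 2 MiB.
 */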
01443 
01444 
01445 /* ---- Tree generation ---- */
01446 
01447 /*  Tree generation context.
01448  */
01449 struct tree_gen_context {
01450 #if FAST_ALLOC
01451     struct arena ar;        /* node memory allocator */
01452 #endif
01453     size_t   nnodes;        /* number of nodes in the tree */
01454     uint64_t leafthresh;    /* maximum # of points per leaf */
01455     uint64_t poidx;         /* next post-order tree traversal index */
01456     uint64_t blockid[NLOD]; /* index of next block ID to assign for each LOD */
01457     struct blk_writer *wr;  /* node writer */
01458 };
01459 
01460 static void tree_gen_context_init(struct tree_gen_context * const ctx,
01461                                   const uint64_t leafthresh,
01462                                   const char * const file,
01463                                   const size_t blksz)
01464 {
01465     int i;
01466     ctx->nnodes = 0;
01467     ctx->leafthresh = leafthresh;
01468     ctx->poidx = 0;
01469     for (i = 0; i < NLOD; ++i) {
01470         ctx->blockid[i] = 0;
01471     }
01472 #if FAST_ALLOC
01473     arena_init(&ctx->ar, sizeof(struct mem_node));
01474 #endif
01475     ctx->wr = blk_writer_init(file, blksz);
01476 }
01477 
01478 static void tree_gen_context_destroy(struct tree_gen_context * const ctx)
01479 {
01480 #if FAST_ALLOC
01481     arena_destroy(&ctx->ar);
01482 #endif
01483     blk_writer_close(ctx->wr, &disk_node_sort);
01484     ctx->wr = NULL;
01485 }
01486 
01487 
01488 /*  Assigns a block ID at the specified level-of-detail to all nodes in
01489     the sub-tree rooted at n that do not already have a block ID at that
01490     level-of-detail. When assigning a block ID to a node, check whether all
01491     block IDs are now valid and if so write the node out to disk.
01492     Children of nodes that are written are destroyed.
01493  */
01494 static void assign_block(struct tree_gen_context * const ctx,
01495                          struct mem_node * const n,
01496                          const uint64_t blockid,
01497                          const int lod)
01498 {
01499     int i;
01500     if (n->id.block[lod] != 0) {
01501         return;
01502     }
01503     /* visit children */
01504     for (i = 0; i < 4; ++i) {
01505         if (n->child[i] != NULL) {
01506             assign_block(ctx, n->child[i], blockid, lod);
01507         }
01508     }
01509     n->id.block[lod] = blockid;
01510     for (i = 0; i < NLOD; ++i) {
01511         if (n->id.block[i] == 0) {
01512             return;
01513         }
01514     }
01515     /* write node to disk */
01516     {
01517         struct disk_node d;
01518         d.id = n->id;
01519         d.count = n->count;
01520         d.index = n->index;
01521         for (i = 0; i < 4; ++i) {
01522             struct mem_node *tmp = n->child[i];
01523             if (tmp != NULL) {
01524                 /* copy id of child to disk node, throw away child */
01525                 d.child[i] = tmp->id;
01526 #if FAST_ALLOC
01527                 arena_free(&ctx->ar, tmp);
01528 #else
01529                 free(tmp);
01530 #endif
01531                 n->child[i] = NULL;
01532             } else {
01533                 memset(&d.child[i], 0, sizeof(struct node_id));
01534             }
01535         }
01536         blk_writer_append(ctx->wr, &d, sizeof(struct disk_node), &disk_node_sort);
01537         ++ctx->nnodes;
01538     }
01539 }
01540 
01541 
01542 /*  Estimates the compressed on-disk size of a tree node.
01543  */
01544 static uint32_t estimate_node_size(const struct mem_node * const node,
01545                                    const uint32_t nchild)
01546 {
01547     uint32_t sz = htm_varint_len(node->index) + htm_varint_len(node->count);
01548     if (nchild > 0) {
01549         /* There is no way to compute size of a child offset accurately
01550            without knowing the final node layout, so use a guess of 4 bytes
01551            per non-empty child. The offset for an empty child (0) will
01552            occupy exactly 1 byte, regardless of layout. */
01553         sz += nchild*3 + 4;
01554     }
01555     return sz;
01556 }
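/*  Example: for an internal node with 2 non-empty children the child
    offset estimate is 2*4 + 2*1 = 10 = nchild*3 + 4 bytes (a 4 byte guess
    per non-empty child, exactly 1 byte per empty child), added to the
    varint-encoded index and count.
 */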
01557 
01558 struct child_info {
01559     struct mem_node *node;
01560     uint32_t size;
01561     uint8_t depth;
01562     int8_t idx;
01563 };
01564 
01565 /*  Selection sort of (at most 8) child_info structs; they are sorted
01566     by block depth and size.
01567  */
01568 static void child_info_isort(struct child_info * const c, const int n)
01569 {
01570     if (n < 2) {
01571         return;
01572     } else {
01573         int i, j, k;
01574         for (i = 0; i < n; ++i) {
01575             k = i;
01576             for (j = i + 1; j < n; ++j) {
01577                 if (c[j].depth < c[k].depth ||
01578                     (c[j].depth == c[k].depth && c[j].size < c[k].size)) {
01579                     k = j;
01580                 }
01581             }
01582             if (k != i) {
01583                 struct child_info tmp = c[k];
01584                 c[k] = c[i];
01585                 c[i] = tmp;
01586             }
01587         }
01588     }
01589 }
01590 
01591 static void layout_node(struct mem_node * const node,
01592                         struct tree_gen_context * const ctx)
01593 {
01594     struct child_info cinfo[4];
01595     int c, nchild;
01596 
01597     if (node->status > NODE_EMITTED) {
01598         return;
01599     }
01600 
01601     /* visit children */
01602     for (c = 0, nchild = 0; c < 4; ++c) {
01603         if (node->child[c] != NULL) {
01604             layout_node(node->child[c], ctx);
01605             cinfo[nchild].node = node->child[c];
01606             cinfo[nchild].idx = c;
01607             ++nchild;
01608         }
01609     }
01610 
01611     /* update status and assign post-order index. */
01612     node->status = NODE_LAID_OUT;
01613     node->id.block[5] = ++ctx->poidx;
01614 
01615     /* Clark & Munro at every level-of-detail */
01616     if (nchild == 0) {
01617         /* leaf */
01618         int lod;
01619         const uint32_t nodesz = estimate_node_size(node, nchild);
01620         const uint32_t info = make_block_info(nodesz, 1u);
01621         for (lod = 0; lod < NLOD; ++lod) {
01622             node->blockinfo[lod] = info;
01623             if (nodesz > layout_size[lod]) {
01624                 uint64_t blockid = ++ctx->blockid[lod];
01625                 assign_block(ctx, node, blockid, lod);
01626             }
01627         }
01628     } else {
01629         /* internal node */
01630         int lod;
01631         const uint32_t nodesz = estimate_node_size(node, nchild);
01632         for (lod = 0; lod < NLOD; ++lod) {
01633             uint64_t blockid;
01634             uint32_t totsz = nodesz;
01635             int close = 0, endclose = nchild;
01636 
01637             for (c = 0; c < nchild; ++c) {
01638                 uint32_t s;
01639                 struct mem_node *tmp = cinfo[c].node;
01640                 s = get_block_size(tmp->blockinfo[lod]);
01641                 cinfo[c].size = s;
01642                 cinfo[c].depth = get_block_depth(tmp->blockinfo[lod]);
01643                 totsz += s;
01644             }
01645             child_info_isort(cinfo, nchild);
01646 
01647             if (cinfo[0].depth == cinfo[nchild - 1].depth) {
01648                 /* all children have the same block depth */
01649                 if (totsz <= layout_size[lod]) {
01650                     /* children and parent all fit in one block; merge them */
01651                     node->blockinfo[lod] = make_block_info(totsz, cinfo[0].depth);
01652                     continue;
01653                 } else {
01654                     /* cannot fit family into one block: scan children
01655                        from smallest to largest, placing as many as possible
01656                        in the same block as the parent. */
01657                     totsz = nodesz;
01658                     for (close = 0; close < nchild - 1; ++close) {
01659                         if (totsz + cinfo[close].size > layout_size[lod]) {
01660                             break;
01661                         }
01662                         totsz += cinfo[close].size;
01663                     }
01664                     /* increase block depth of parent by 1 */
01665                     node->blockinfo[lod] = make_block_info(totsz, cinfo[0].depth + 1);
01666                 }
01667             } else {
01668                 /* nchild > 1, not all children have the same block depth */
01669                 totsz = nodesz;
01670                 for (endclose = nchild - 1; endclose > 0; --endclose) {
01671                     totsz += cinfo[endclose].size;
01672                     if (cinfo[endclose - 1].depth != cinfo[nchild - 1].depth) {
01673                         break;
01674                     }
01675                 }
01676                 if (totsz < layout_size[lod]) {
01677                     /* merge parent and largest-depth children */
01678                     node->blockinfo[lod] = make_block_info(totsz, cinfo[nchild - 1].depth);
01679                 } else {
01680                     /* fresh block for parent, increase block depth by 1 */
01681                     node->blockinfo[lod] = make_block_info(nodesz, cinfo[nchild - 1].depth + 1);
01682                     endclose = nchild;
01683                 }
01684             }
01685             /* scan remaining children from smallest to largest, merging
01686                runs of children into a single block where possible. */
01687             totsz = cinfo[close].size;
01688             for (c = close + 1; c < endclose; ++c) {
01689                 if (totsz + cinfo[c].size > layout_size[lod]) {
01690                     blockid = ++ctx->blockid[lod];
01691                     for (; close < c; ++close) {
01692                         assign_block(ctx, cinfo[close].node, blockid, lod);
01693                     }
01694                     totsz = cinfo[c].size;
01695                 } else {
01696                     totsz += cinfo[c].size;
01697                 }
01698             }
01699             blockid = ++ctx->blockid[lod];
01700             for (; close < endclose; ++close) {
01701                 assign_block(ctx, cinfo[close].node, blockid, lod);
01702             }
01703         }
01704     }
01705 }
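/*  Worked example (illustrative, for one level-of-detail with
    layout_size[lod] == 256): an internal node of estimated size 24 bytes
    has four children of sizes 200, 20, 20 and 20 at equal block depth d.
    The whole family does not fit in one block (284 > 256), so the children
    are scanned from smallest to largest: 24 + 20 + 20 + 20 = 84 still fits,
    but adding the 200-byte child would overflow. The three small children
    therefore stay with the parent (84 bytes carried upward at depth d + 1),
    while the 200-byte child is closed off and assigned a fresh block ID.
 */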
01706 
01707 /*  Called on a node when all points belonging to it have been accounted for.
01708  */
01709 static void emit_node(struct mem_node * const node,
01710                       struct tree_gen_context * const ctx)
01711 {
01712     int c;
01713     if (node->status > NODE_INIT) {
01714         return;
01715     }
01716     /* visit children */
01717     for (c = 0; c < 4; ++c) {
01718         if (node->child[c] != NULL) {
01719             emit_node(node->child[c], ctx);
01720         }
01721     }
01722     if (node->count < ctx->leafthresh) {
01723         /* if the node point count is too small,
01724            make it a leaf by deleting all children. */
01725         for (c = 0; c < 4; ++c) {
01726             if (node->child[c] != NULL) {
01727 #if FAST_ALLOC
01728                 arena_free(&ctx->ar, node->child[c]);
01729 #else
01730                 free(node->child[c]);
01731 #endif
01732                 node->child[c] = NULL;
01733             }
01734         }
01735         node->status = NODE_EMITTED;
01736     } else {
01737         /* otherwise, layout the subtree rooted at node */
01738         layout_node(node, ctx);
01739     }
01740 }
01741 
01742 /*  Adds a tree entry beneath root (one of S0,S1,S2,S3,N0,N1,N2,N3).
01743  */
01744 static void add_node(struct mem_node * const root,
01745                      struct tree_gen_context * const ctx,
01746                      int64_t htmid,
01747                      int64_t count,
01748                      int64_t index)
01749 {
01750     struct mem_node *node;
01751     int lvl = 0;
01752 
01753     for (lvl = 0, node = root; lvl < 20; ++lvl) {
01754         /* keep subdividing */
01755         int i, c;
01756         node->count += count;
01757         c = (htmid >> 2*(19 - lvl)) & 3;
01758         for (i = 0; i < c; ++i) {
01759             struct mem_node *tmp = node->child[i];
01760             if (tmp != NULL && tmp->status == NODE_INIT) {
01761                 emit_node(node->child[i], ctx);
01762             }
01763         }
01764         index -= node->index; /* relativize index */
01765         if (node->child[c] != NULL) {
01766             node = node->child[c];
01767         } else {
01768             /* create child node */
01769 #if FAST_ALLOC
01770             struct mem_node *child = (struct mem_node *) arena_alloc(&ctx->ar);
01771 #else
01772             struct mem_node *child =
01773                 (struct mem_node *) malloc(sizeof(struct mem_node));
01774             if (child == NULL) {
01775                 err("malloc() failed");
01776             }
01777 #endif
01778             memset(child, 0, sizeof(struct mem_node));
01779             child->index = index;
01780             child->htmid = node->htmid*4 + c;
01781             node->child[c] = child;
01782             node = child;
01783         }
01784     }
01785     assert(node->htmid == htmid);
01786     node->count = count;
01787 }
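
/*  Illustrative sketch (editorial, not part of the build): how add_node()
    walks an HTM ID. A level-20 HTM ID packs a root number (8..15, i.e. one
    of S0,S1,S2,S3,N0,N1,N2,N3) in its top bits, followed by twenty 2-bit
    child indices. The hypothetical helper below extracts them with the same
    shifts used above; it relies on the fixed-width integer types already
    available in this file. */
static void htmid_decompose_sketch(int64_t htmid, int *root, int child[20])
{
    int lvl;
    *root = (int) (htmid >> 40);  /* 8..15 for a valid level-20 ID */
    for (lvl = 0; lvl < 20; ++lvl) {
        /* child index chosen at subdivision level lvl + 1 */
        child[lvl] = (int) ((htmid >> 2*(19 - lvl)) & 3);
    }
}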
01788 
01789 
01790 /*  Container for the 8 level-0 HTM tree nodes.
01791  */
01792 struct tree_root {
01793     uint64_t count; /* Total number of points in tree */
01794     struct mem_node *child[8];
01795     struct node_id childid[8];
01796 };
01797 
01798 
01799 /*  Assigns block IDs to the HTM root nodes.
01800  */
01801 static void finish_root(struct tree_root * const super,
01802                         struct tree_gen_context * const ctx)
01803 {
01804     struct child_info cinfo[8];
01805     int c, close, nchild, lod;
01806 
01807     for (c = 0, nchild = 0; c < 8; ++c) {
01808         struct mem_node *tmp = super->child[c];
01809         if (tmp != NULL) {
01810             super->count += tmp->count;
01811             cinfo[nchild].node = tmp;
01812             cinfo[nchild].idx = c;
01813             ++nchild;
01814         }
01815     }
01816     /* Clark & Munro for each level of detail */
01817     for (lod = 0; lod < NLOD; ++lod) {
01818         uint64_t blockid;
01819         uint32_t totsz;
01820         for (c = 0; c < nchild; ++c) {
01821             struct mem_node *tmp = cinfo[c].node;
01822             cinfo[c].size = get_block_size(tmp->blockinfo[lod]);
01823             cinfo[c].depth = get_block_depth(tmp->blockinfo[lod]);
01824         }
01825         child_info_isort(cinfo, nchild);
01826         /* scan children from smallest to largest, merging
01827            as many as possible into blocks. */
01828         close = 0;
01829         totsz = cinfo[0].size;
01830         for (c = 1; c < nchild; ++c) {
01831             if (totsz + cinfo[c].size > layout_size[lod]) {
01832                 blockid = ++ctx->blockid[lod];
01833                 for (; close < c; ++close) {
01834                     assign_block(ctx, cinfo[close].node, blockid, lod);
01835                 }
01836                 totsz = cinfo[c].size;
01837             } else { totsz += cinfo[c].size; }
01838         }
01839         blockid = ++ctx->blockid[lod];
01840         for (; close < nchild; ++close) {
01841             assign_block(ctx, cinfo[close].node, blockid, lod);
01842         }
01843     }
01844     /* At this point, all nodes are guaranteed to have been written to disk.
01845        Copy the HTM root node IDs into the super root, then free the root nodes. */
01846     for (c = 0; c < nchild; ++c) {
01847         super->childid[cinfo[c].idx] = cinfo[c].node->id;
01848 #if FAST_ALLOC
01849         arena_free(&ctx->ar, cinfo[c].node);
01850 #else
01851         free(cinfo[c].node);
01852 #endif
01853         super->child[cinfo[c].idx] = NULL;
01854     }
01855 }
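
/*  Illustrative sketch (editorial, not part of the build): the greedy packing
    performed above and in layout_node(). Children are visited in ascending
    size order and merged into the current block until adding the next child
    would overflow the block capacity, at which point a new block is started.
    The hypothetical helper below assigns a block number to each entry of an
    ascending sizes[] array and returns the number of blocks used. */
static size_t pack_sizes_sketch(const uint32_t *sizes, size_t n,
                                uint32_t capacity, size_t *block)
{
    size_t i, nblocks = 0;
    uint32_t totsz = 0;
    for (i = 0; i < n; ++i) {
        if (totsz != 0 && totsz + sizes[i] > capacity) {
            ++nblocks;   /* close the current block, start a new one */
            totsz = 0;
        }
        totsz += sizes[i];
        block[i] = nblocks;
    }
    return (n == 0) ? 0 : nblocks + 1;
}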
01856 
01857 
01858 /*  Tree generation driver function.
01859  */
01860 static size_t tree_gen(const char * const datafile,
01861                        const char * const treefile,
01862                        const struct mem_params * const mem,
01863                        struct tree_root * const super,
01864                        const uint64_t leafthresh,
01865                        const size_t npoints)
01866 {
01867     struct tree_gen_context ctx;
01868     const struct tree_entry *data;
01869     void *behind;
01870     int64_t htmid;
01871     uint64_t index, count;
01872     size_t i, nseg;
01873     int r, fd;
01874     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
01875     size_t mapsz = npoints * sizeof(struct tree_entry);
01876     double t;
01877 
01878     if (npoints == 0) {
01879         err("no input points");
01880     }
01881     msg("Generating block sorted tree node file %s from %s\n",
01882         treefile, datafile);
01883     t = now();
01884     fd = open(datafile, O_RDONLY);
01885     if (fd == -1) {
01886         err("failed to open file %s for reading", datafile);
01887     }
01888     if (mapsz % pagesz != 0) {
01889         mapsz += pagesz - mapsz % pagesz;
01890     }
01891     behind = mmap(NULL, mapsz, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
01892     if (behind == MAP_FAILED) {
01893         err("mmap() file %s for reading", datafile);
01894     }
01895     if (madvise(behind, mapsz, MADV_SEQUENTIAL) != 0) {
01896         err("madvise() failed on mmap for file %s", datafile);
01897     }
01898     data = (const struct tree_entry *) behind;
01899     behind = ((unsigned char *) behind) + mem->ioblksz;
01900     tree_gen_context_init(&ctx, leafthresh, treefile, mem->sortsz);
01901     memset(super, 0, sizeof(struct tree_root));
01902 
01903     /* walk over tree entries, adding tree nodes. */
01904     if (data[0].htmid == 0) {
01905         err("invalid HTM ID");
01906     }
01907     htmid = 0;
01908     index = 0;
01909     count = 0;
01910     r = -1;
01911     for (i = 0; i < npoints; ++i) {
01912         if ((void *) &data[i] > behind) {
01913             void *ptr = ((unsigned char *) behind) - mem->ioblksz;
01914             if (madvise(ptr, mem->ioblksz, MADV_DONTNEED) != 0) {
01915                 err("madvise() failed");
01916             }
01917             behind = ((unsigned char *) behind) + mem->ioblksz;
01918         }
01919         if (data[i].htmid == htmid) {
01920             /* increase point count for the current htmid */
01921             ++count;
01922         } else {
01923             int r2;
01924             assert(data[i].htmid > htmid);
01925             if (r >= 0) {
01926                 /* add previous node if there is one */
01927                 add_node(super->child[r], &ctx, htmid, count, (uint64_t) index);
01928             }
01929             /* reset index, count, and htmid */
01930             count = 1;
01931             index = (uint64_t) i;
01932             htmid = data[i].htmid;
01933             r2 = (int) (htmid >> 40) - 8;
01934             if (r2 < 0 || r2 > 7) {
01935                 err("invalid HTM ID");
01936             }
01937             if (r != r2) {
01938                 /* need a new HTM root node */
01939                 if (r >= 0) {
01940                     /* emit and lay out the previous root if there is one */
01941                     emit_node(super->child[r], &ctx);
01942                     layout_node(super->child[r], &ctx);
01943                 }
01944                 r = r2;
01945                 /* create new root */
01946 #if FAST_ALLOC
01947                 super->child[r] = (struct mem_node *) arena_alloc(&ctx.ar);
01948 #else
01949                 super->child[r] =
01950                     (struct mem_node *) malloc(sizeof(struct mem_node));
01951                 if (super->child[r] == NULL) {
01952                     err("malloc() failed");
01953                 }
01954 #endif
01955                 memset(super->child[r], 0, sizeof(struct mem_node));
01956                 super->child[r]->htmid = r + 8;
01957                 super->child[r]->index = index;
01958             }
01959         }
01960     }
01961     /* add the last node, then emit and lay out the last root */
01962     add_node(super->child[r], &ctx, htmid, count, (uint64_t) index);
01963     emit_node(super->child[r], &ctx);
01964     layout_node(super->child[r], &ctx);
01965 
01966     /* assign block IDs to the HTM roots */
01967     finish_root(super, &ctx);
01968     if (super->count != npoints) {
01969         err("bug in tree generation phase - points not all accounted for");
01970     }
01971     /* cleanup */
01972     i = ctx.nnodes;
01973     nseg = ctx.ar.nseg;
01974     tree_gen_context_destroy(&ctx);
01975     if (munmap((void *) data, mapsz) != 0) {
01976         err("munmap() failed");
01977     }
01978     if (close(fd) != 0) {
01979         err("close() failed");
01980     }
01981     msg("\t%.3f sec total (%llu tree nodes, %llu MiB memory)\n\n",
01982         now() - t, (unsigned long long) i,
01983         (unsigned long long) (nseg*ARENA_SEGSZ)/(1024*1024));
01984     return i;
01985 }
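
/*  Illustrative sketch (editorial, not part of the build): the streaming read
    pattern shared by tree_gen(), tree_compress() and spherical_to_vec(). The
    input is mapped read-only, the kernel is told access will be sequential,
    and whenever the scan moves past an IO block boundary the block just left
    behind is released with MADV_DONTNEED so resident memory stays bounded.
    The function and callback names are hypothetical; ioblksz is assumed to
    be a multiple of the page size (as mem_params_init() guarantees) and the
    file size a multiple of recsz. */
static void stream_records_sketch(const char *path, size_t recsz,
                                  size_t ioblksz,
                                  void (*visit)(const void *rec))
{
    struct stat st;
    unsigned char *base;
    unsigned char *behind;
    size_t i, n, mapsz;
    int fd;

    fd = open(path, O_RDONLY);
    if (fd == -1 || fstat(fd, &st) != 0) {
        err("failed to open or stat file %s", path);
    }
    mapsz = (size_t) st.st_size;
    base = (unsigned char *) mmap(NULL, mapsz, PROT_READ,
                                  MAP_SHARED | MAP_NORESERVE, fd, 0);
    if ((void *) base == MAP_FAILED) {
        err("mmap() file %s for reading", path);
    }
    if (madvise(base, mapsz, MADV_SEQUENTIAL) != 0) {
        err("madvise() failed on mmap for file %s", path);
    }
    behind = base + ioblksz;
    n = mapsz / recsz;
    for (i = 0; i < n; ++i) {
        const unsigned char *rec = base + i*recsz;
        if (rec > behind) {
            /* the block [behind - ioblksz, behind) has been consumed */
            if (madvise(behind - ioblksz, ioblksz, MADV_DONTNEED) != 0) {
                err("madvise() failed");
            }
            behind += ioblksz;
        }
        visit(rec);
    }
    if (munmap((void *) base, mapsz) != 0) {
        err("munmap() failed");
    }
    if (close(fd) != 0) {
        err("close() failed");
    }
}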
01986 
01987 
01988 /* ================================================================ */
01989 /*                    Phase 3: Tree compression                     */
01990 /* ================================================================ */
01991 
01992 /*  Mapping from a node ID to a relative file offset.
01993  */
01994 struct id_off {
01995     struct node_id id;
01996     uint64_t off;
01997     struct id_off *next;
01998 } HTM_ALIGNED(16);
01999 
02000 /*  Simple hash table that maps node IDs to relative file offsets. The
02001     implementation uses a power-of-2 sized backing array and chains on
02002     collision. The hash-code of a node ID is taken to be its post-order
02003     index (which is unique).
02004  */
02005 struct hash_table {
02006     size_t n;
02007     size_t cap;
02008     struct id_off **array;
02009 #if FAST_ALLOC
02010     struct arena ar;
02011 #endif
02012 };
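
/*  Illustrative sketch (editorial, not part of the build): because the
    backing array capacity is always a power of 2, the bucket of a node is
    obtained by masking its post-order index rather than taking a modulo,
    exactly as hash_table_add() and hash_table_get() do below. The helper
    name is hypothetical. */
static size_t bucket_index_sketch(uint64_t postorder_index, size_t cap)
{
    /* cap is a power of two, so cap - 1 is an all-ones bit mask */
    return (size_t) postorder_index & (cap - 1);
}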
02013 
02014 
02015 static void hash_table_init(struct hash_table *ht)
02016 {
02017     ht->cap = 65536; /* must be a power of 2 */
02018     ht->n = 0;
02019     ht->array = (struct id_off **) malloc(ht->cap * sizeof(struct id_off *));
02020     if (ht->array == NULL) {
02021         err("malloc() failed");
02022     }
02023     memset(ht->array, 0, ht->cap * sizeof(struct id_off *));
02024 #if FAST_ALLOC
02025     arena_init(&ht->ar, sizeof(struct id_off));
02026 #endif
02027 }
02028 
02029 
02030 static void hash_table_destroy(struct hash_table *ht)
02031 {
02032 #if FAST_ALLOC
02033     arena_destroy(&ht->ar);
02034 #else
02035     size_t i;
02036     struct id_off *ptr;
02037     for (i = 0; i < ht->cap; ++i) {
02038         ptr = ht->array[i];
02039         while (ptr != NULL) {
02040             struct id_off *next = ptr->next;
02041             free(ptr);
02042             ptr = next;
02043         }
02044     }
02045 #endif
02046     free(ht->array);
02047     ht->array = NULL;
02048     ht->n = 0;
02049     ht->cap = 0;
02050 }
02051 
02052 
02053 /*  Grows hash table capacity by a factor of two.
02054  */
02055 static void hash_table_grow(struct hash_table *ht) {
02056     size_t i, cap = ht->cap;
02057     struct id_off **array = ht->array;
02058 
02059     ht->cap = 2*cap;
02060     ht->array = (struct id_off **) malloc(ht->cap * sizeof(struct id_off *));
02061     if (ht->array == NULL) {
02062         err("malloc() failed");
02063     }
02064     memset(ht->array, 0, ht->cap * sizeof(struct id_off *));
02065 
02066     /* add previous hash table entries */
02067     for (i = 0; i < cap; ++i) {
02068         struct id_off *e = array[i];
02069         while (e != NULL) {
02070             struct id_off *next = e->next;
02071             size_t j = (2*cap - 1) & (size_t) e->id.block[NLOD];
02072             e->next = ht->array[j];
02073             ht->array[j] = e;
02074             e = next;
02075         }
02076     }
02077 }
02078 
02079 
02080 /*  Adds an id to offset mapping to the given hash table.
02081  */
02082 static void hash_table_add(struct hash_table * const ht,
02083                            const struct node_id * const id,
02084                            uint64_t off)
02085 {
02086     struct id_off *e;
02087     size_t i;
02088     if ((ht->n*3)/4 > ht->cap) {
02089         hash_table_grow(ht);
02090     }
02091     i = (ht->cap - 1) & (size_t) id->block[NLOD];
02092 #if FAST_ALLOC
02093     e = (struct id_off *) arena_alloc(&ht->ar);
02094 #else
02095     e = (struct id_off *) malloc(sizeof(struct id_off));
02096     if (e == NULL) {
02097         err("malloc() failed");
02098     }
02099 #endif
02100     e->id = *id;
02101     e->off = off;
02102     e->next = ht->array[i];
02103     ht->array[i] = e;
02104     ++ht->n;
02105 }
02106 
02107 
02108 /*  Returns the offset of the node with the given ID and removes
02109     the corresponding hash table entry.
02110  */
02111 static uint64_t hash_table_get(struct hash_table * const ht,
02112                                const struct node_id * const id)
02113 {
02114     struct id_off *e, *prev;
02115     const size_t i = (ht->cap - 1) & (size_t) id->block[NLOD];
02116     prev = NULL;
02117     e = ht->array[i];
02118     /* child must occur before parent */
02119     while (1) {
02120         if (e == NULL) {
02121             err("tree generation bug: parent node written before child");
02122         }
02123         if (node_id_eq(id, &e->id)) {
02124             uint64_t off = e->off;
02125             if (prev == NULL) {
02126                 ht->array[i] = e->next;
02127             } else {
02128                 prev->next = e->next;
02129             }
02130 #if FAST_ALLOC
02131             arena_free(&ht->ar, e);
02132 #else
02133             free(e);
02134 #endif
02135             --ht->n;
02136             return off;
02137         }
02138         prev = e;
02139         e = e->next;
02140     }
02141     /* never reached */
02142 }
02143 
02144 
02145 /*  Writes out a tree node (byte reversed).
02146  */
02147 static uint64_t compress_node(struct hash_table * const ht,
02148                               struct blk_writer * const wr,
02149                               const struct disk_node * const n,
02150                               const uint64_t filesz,
02151                               const uint64_t leafthresh)
02152 {
02153     unsigned char buf[48];
02154     unsigned char *s = buf;
02155     uint64_t sz = filesz;
02156     unsigned int v;
02157     int c, leaf;
02158 
02159     /* write out child offsets (from child 3 to child 0) */
02160     for (c = 3, leaf = 1; c >= 0; --c) {
02161         if (node_empty(&n->child[c])) {
02162             *s = 0;
02163             ++s;
02164             ++sz;
02165         } else {
02166             /* this is tricky - child 3 of n can be laid out immediately after
02167                n, yielding a child offset of 0. But 0 also means "empty child",
02168                so instead encode the actual offset + 1. */
02169             v = htm_varint_rencode(s, sz + 1 - hash_table_get(ht, &n->child[c]));
02170             s += v;
02171             sz += v;
02172             leaf = 0;
02173         }
02174     }
02175     if (leaf != 0) {
02176         /* n is a leaf: don't store child offsets */
02177         s -= 4;
02178         sz -= 4;
02179     } else if (n->count < leafthresh) {
02180         err("tree generation bug: internal node contains too few points");
02181     }
02182     /* write out relative index, then count */
02183     v = htm_varint_rencode(s, n->index);
02184     s += v;
02185     sz += v;
02186     v = htm_varint_rencode(s, n->count);
02187     s += v;
02188     sz += v;
02189     /* write out byte reversed node, add node id to hashtable */
02190     blk_writer_append(wr, buf, (size_t) (s - buf), NULL);
02191     hash_table_add(ht, &n->id, sz);
02192     return sz;
02193 }
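
/*  Illustrative sketch (editorial, not part of the build): the "offset + 1"
    trick used above. A relative child offset of 0 is a legal value (a child
    can end up immediately adjacent to its parent in the final file), so 0 is
    reserved to mean "no child" and real offsets are stored shifted by one.
    Hypothetical encode/decode helpers: */
static uint64_t child_ref_encode_sketch(int empty, uint64_t offset)
{
    return empty ? 0 : offset + 1;
}

static int child_ref_decode_sketch(uint64_t stored, uint64_t *offset)
{
    if (stored == 0) {
        return 0;          /* no child */
    }
    *offset = stored - 1;  /* recover the actual relative offset */
    return 1;              /* child present */
}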
02194 
02195 
02196 /*  Writes out a tree header (byte reversed).
02197  */
02198 static uint64_t write_tree_header(struct hash_table * const ht,
02199                                   struct blk_writer * const wr,
02200                                   const struct tree_root * const super,
02201                                   const uint64_t filesz,
02202                                   const uint64_t leafthresh)
02203 {
02204     unsigned char buf[96];
02205     unsigned char *s;
02206     uint64_t sz;
02207     int r;
02208     unsigned int v;
02209 
02210     /* write offsets of N3, N2, N1, N0, S3, S2, S1, S0 */
02211     for (r = 7, s = buf, sz = filesz; r >= 0; --r) {
02212         if (node_empty(&super->childid[r])) {
02213             *s = 0;
02214             ++s;
02215             ++sz;
02216         } else {
02217             /* N3 could be laid out immediately after super root,
02218                yielding a child offset of 0, which means "empty child".
02219                Therefore encode 1 + actual offset. */
02220             v = htm_varint_rencode(
02221                 s, sz + 1 - hash_table_get(ht, &super->childid[r]));
02222             s += v;
02223             sz += v;
02224         }
02225     }
02226     /* write total number of points in tree */
02227     v = htm_varint_rencode(s, super->count);
02228     s += v;
02229     sz += v;
02230     /* write leaf threshold */
02231     v = htm_varint_rencode(s, leafthresh);
02232     s += v;
02233     sz += v;
02234     blk_writer_append(wr, buf, (size_t) (s - buf), NULL);
02235     if (ht->n != 0) {
02236         err("tree compression bug: node id hash table non-empty");
02237     }
02238     return sz;
02239 }
02240 
02241 
02242 /*  Tree compression driver function.
02243  */
02244 static uint64_t tree_compress(const char * const treefile,
02245                               const char * const scratchfile,
02246                               const struct mem_params * const mem,
02247                               const struct tree_root * const super,
02248                               const size_t nnodes,
02249                               const uint64_t leafthresh)
02250 {
02251     struct hash_table ht;
02252     struct blk_writer *wr;
02253     const struct disk_node *data;
02254     void *behind;
02255     double t;
02256     size_t i;
02257     uint64_t filesz;
02258     int fd;
02259     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
02260     size_t mapsz = nnodes * sizeof(struct disk_node);
02261 
02262     if (nnodes == 0) {
02263         err("no input nodes");
02264     }
02265     t = now();
02266     msg("Generating reversed compressed tree node file %s from %s\n",
02267         scratchfile, treefile);
02268     fd = open(treefile, O_RDONLY);
02269     if (fd == -1) {
02270         err("failed to open file %s for reading", treefile);
02271     }
02272     if (mapsz % pagesz != 0) {
02273         mapsz += pagesz - mapsz % pagesz;
02274     }
02275     behind = mmap(NULL, mapsz, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
02276     if (behind == MAP_FAILED) {
02277         err("mmap() file %s for reading", treefile);
02278     }
02279     if (madvise(behind, mapsz, MADV_SEQUENTIAL) != 0) {
02280         err("madvise() failed on mmap for file %s", treefile);
02281     }
02282     data = (const struct disk_node *) behind;
02283     behind = ((unsigned char *) behind) + mem->ioblksz;
02284     hash_table_init(&ht);
02285     wr = blk_writer_init(scratchfile, mem->ioblksz);
02286 
02287     /* write nodes */
02288     for (i = 0, filesz = 0; i < nnodes; ++i) {
02289         if (i > 0) {
02290             assert(disk_node_lt(&data[i - 1], &data[i]) &&
02291                    "tree node file not sorted");
02292         }
02293         if ((void *) &data[i] > behind) {
02294             void *ptr = ((unsigned char *) behind) - mem->ioblksz;
02295             if (madvise(ptr, mem->ioblksz, MADV_DONTNEED) != 0) {
02296                 err("madvise() failed");
02297             }
02298             behind = ((unsigned char *) behind) + mem->ioblksz;
02299         }
02300         filesz = compress_node(&ht, wr, &data[i], filesz, leafthresh);
02301     }
02302     /* and tree header */
02303     filesz = write_tree_header(&ht, wr, super, filesz, leafthresh);
02304     /* cleanup */
02305     blk_writer_close(wr, NULL);
02306     hash_table_destroy(&ht);
02307     if (munmap((void *) data, mapsz) != 0) {
02308         err("munmap() failed");
02309     }
02310     if (close(fd) != 0) {
02311         err("close() failed");
02312     }
02313     msg("\t%.3f sec total\n\n", now() - t);
02314     return filesz;
02315 }
02316 
02317 
02318 /*  Byte reverses an input file to an output file and removes the input file.
02319  */
02320 static void reverse_file(const char * const infile,
02321                          const char * const outfile,
02322                          const struct mem_params * const mem,
02323                          const uint64_t filesz)
02324 {
02325     const unsigned char *data;
02326     struct blk_writer *wr;
02327     double t;
02328     uint64_t i, j;
02329     int fd;
02330     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
02331     size_t mapsz = filesz;
02332 
02333     if (filesz == 0) {
02334         err("cannot reverse an empty file");
02335     }
02336     msg("Reversing file %s to produce %s\n", infile, outfile);
02337     t = now();
02338     fd = open(infile, O_RDONLY);
02339     if (fd == -1) {
02340         err("failed to open file %s for reading", infile);
02341     }
02342     if (mapsz % pagesz != 0) {
02343         mapsz += pagesz - mapsz % pagesz;
02344     }
02345     data = (const unsigned char *) mmap(NULL, mapsz, PROT_READ,
02346                                         MAP_SHARED | MAP_NORESERVE, fd, 0);
02347     if ((void *) data == MAP_FAILED) {
02348         err("failed to mmap() file %s", infile);
02349     }
02350     wr = blk_writer_init(outfile, mem->ioblksz);
02351     j = mem->ioblksz * (filesz / mem->ioblksz);
02352     if (j == filesz) {
02353         j -= mem->ioblksz;
02354     }
02355     for (i = filesz - 1; i > 0; --i) {
02356         blk_writer_append(wr, &data[i], 1, NULL);
02357         if (i == j) {
02358             size_t sz = filesz - j;
02359             if (sz > mem->ioblksz) {
02360                 sz = mem->ioblksz;
02361             }
02362             madvise((void *) &data[i], sz, MADV_DONTNEED);
02363             j -= mem->ioblksz;
02364         }
02365     }
02366     blk_writer_append(wr, data, 1, NULL);
02367     blk_writer_close(wr, NULL);
02368     if (munmap((void *) data, mapsz) != 0) {
02369         err("munmap() failed");
02370     }
02371     if (close(fd) != 0) {
02372         err("close() failed");
02373     }
02374     if (unlink(infile) != 0) {
02375         err("failed to delete file %s", infile);
02376     }
02377     msg("\t%.3f sec total\n\n", now() - t);
02378 }
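
/*  Illustrative sketch (editorial, not part of the build): an in-memory
    equivalent of reverse_file(). The scratch file is written back to front
    (children and reverse-encoded varints first, the tree header last), so
    reversing every byte yields the final tree file with the header at offset
    0 and each parent ahead of its children. */
static void reverse_bytes_sketch(unsigned char *buf, size_t n)
{
    size_t i, j;
    if (n < 2) {
        return;
    }
    for (i = 0, j = n - 1; i < j; ++i, --j) {
        unsigned char tmp = buf[i];
        buf[i] = buf[j];
        buf[j] = tmp;
    }
}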
02379 
02380 
02381 /* ================================================================ */
02382 /*        Phase 4: Convert spherical coords to unit vectors         */
02383 /* ================================================================ */
02384 
02385 static void spherical_to_vec(const char * const datafile,
02386                              const char * const scratchfile,
02387                              const struct mem_params * const mem,
02388                              const size_t npoints)
02389 {
02390     const struct tree_entry *data;
02391     struct blk_writer *wr;
02392     void *behind;
02393     double t;
02394     size_t i;
02395     int fd;
02396     const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
02397     size_t mapsz = sizeof(struct tree_entry) * npoints;
02398 
02399     if (mapsz == 0) {
02400         err("empty data file");
02401     }
02402     msg("Converting spherical coordinates in %s to unit vectors\n", datafile);
02403     t = now();
02404     fd = open(datafile, O_RDONLY);
02405     if (fd == -1) {
02406         err("failed to open file %s for reading", datafile);
02407     }
02408     if (mapsz % pagesz != 0) {
02409         mapsz += pagesz - mapsz % pagesz;
02410     }
02411     behind = mmap(NULL, mapsz, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
02412     if (behind == MAP_FAILED) {
02413         err("failed to mmap() file %s", datafile);
02414     }
02415     if (madvise(behind, mapsz, MADV_SEQUENTIAL) != 0) {
02416         err("madvise() failed");
02417     }
02418     data = (const struct tree_entry *) behind;
02419     behind = ((unsigned char *) behind) + mem->ioblksz;
02420     wr = blk_writer_init(scratchfile, mem->ioblksz);
02421     for (i = 0; i < npoints; ++i) {
02422         struct htm_tree_entry e;
02423         if ((void *) &data[i] > behind) {
02424             void *ptr = ((unsigned char *) behind) - mem->ioblksz;
02425             if (madvise(ptr, mem->ioblksz, MADV_DONTNEED) != 0) {
02426                 err("madvise() failed");
02427             }
02428             behind = ((unsigned char *) behind) + mem->ioblksz;
02429         }
02430         htm_sc_tov3(&e.v, &data[i].sc);
02431         e.rowid = data[i].rowid;
02432         blk_writer_append(wr, &e, sizeof(struct htm_tree_entry), NULL);
02433     }
02434     blk_writer_close(wr, NULL);
02435     if (munmap((void *) data, mapsz) != 0) {
02436         err("munmap() failed");
02437     }
02438     if (close(fd) != 0) {
02439         err("close() failed");
02440     }
02441     if (rename(scratchfile, datafile) != 0) {
02442         err("failed to rename file %s to %s", scratchfile, datafile);
02443     }
02444     msg("\t%.3f sec total\n\n", now() - t);
02445 }
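
/*  Illustrative sketch (editorial, not part of the build): the kind of
    conversion htm_sc_tov3() is expected to perform, assuming the usual
    convention that right ascension/longitude and declination/latitude are
    given in degrees and map to a point on the unit sphere. The helper name
    is hypothetical; <math.h> is not otherwise needed by this file. */
#include <math.h>

static void sc_to_unit_vector_sketch(double ra_deg, double dec_deg,
                                     double v[3])
{
    const double rad = 3.14159265358979323846 / 180.0; /* degrees to radians */
    const double cd = cos(dec_deg * rad);
    v[0] = cd * cos(ra_deg * rad);
    v[1] = cd * sin(ra_deg * rad);
    v[2] = sin(dec_deg * rad);
}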
02446 
02447 
02448 /* ================================================================ */
02449 /*                             main()                               */
02450 /* ================================================================ */
02451 
02452 static void usage(const char *prog) {
02453     printf("%s [options] <out_path> <in_file_1> [<in_file_2> ...]\n"
02454            "\n"
02455            "    This utility takes character-separated-value files\n"
02456            "containing spherical coordinates, and outputs an HTM tree index\n"
02457            "useful for fast point-in-spherical-region counting. The first\n"
02458            "column of the input files must correspond to unique row IDs\n"
02459            "(64-bit integers), the second to right ascension/longitude angles\n"
02460            "(in degrees), and the third to declination/latitude angles (also\n"
02461            "in degrees). Additional columns are allowed but ignored.\n"
02462            "\n"
02463            "    Typically, two output files named <out_path>.htm and\n"
02464            "<out_path>.dat are generated, though the first may optionally\n"
02465            "be omitted.\n"
02466            "\n"
02467            "    The first is a tree index, and the second contains points\n"
02468            "sorted by level 20 HTM ID. If the inputs contain no more than a\n"
02469            "configurable number of points, the tree index is omitted. When is\n"
02470            "this useful? For small data-sets, scanning the point file and\n"
02471            "testing each point for spherical-region membership can be faster\n"
02472            "than searching a tree. This is because tree searching has some\n"
02473            "overhead, and for sufficiently small data sets, a single page\n"
02474            "on disk will contain a significant fraction of the points. In\n"
02475            "this case, using a tree to reduce the number of points tested\n"
02476            "against a region fails to reduce IO compared to a scan.\n"
02477            "\n"
02478            "==== Options ====\n"
02479            "\n"
02480            "--help        |-h        :  Prints usage information.\n"
02481            "--blk-size    |-b <int>  :  IO block size in KiB. The default is\n"
02482            "                            1024 KiB.\n"
02483            "--delim       |-d <char> :  The separator character to use when\n"
02484            "                            parsing input files. The default is '|'.\n"
02485            "--max-mem     |-m <int>  :  Approximate memory usage limit in MiB.\n"
02486            "                            The default is 512 MiB. Note that this\n"
02487            "                            applies only to various external sorts,\n"
02488            "                            and that tree generation can use up to\n"
02489            "                            about 512 MiB of memory regardless.\n"
02490            "--tree-min    |-t <int>  :  If the total number of input points does\n"
02491            "                            not exceed the specified number, tree\n"
02492            "                            index generation is skipped. The default\n"
02493            "                            is 1024.\n"
02494            "--leaf-thresh |-l <int>  :  Minimum number of points in an internal\n"
02495            "                            tree node. The default is 64.\n",
02496            prog);
02497 }
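
/*  Example invocation (editorial; the binary name and file names are
    hypothetical). Assuming the tool is built as htm_tree_gen, the command

        htm_tree_gen --delim , --max-mem 1024 index points1.csv points2.csv

    reads the two delimited input files and produces index.dat (points sorted
    by level 20 HTM ID) plus index.htm (the compressed tree), the latter being
    skipped when the inputs contain no more than --tree-min points. */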
02498 
02499 
02500 int main(int argc, char **argv)
02501 {
02502     struct mem_params mem;
02503     char **infile;
02504     char *treefile, *datafile, *scratch;
02505     int nfile;
02506     size_t len, npoints, nnodes;
02507     size_t memsz = 512*1024*1024;
02508     size_t ioblksz = 1024*1024;
02509     size_t minpoints = 1024;
02510     uint64_t leafthresh = 64;
02511     char delim = '|';
02512 
02513     while (1) {
02514         static struct option long_options[] = {
02515               { "help",        no_argument,       0, 'h' },
02516               { "blk-size",    required_argument, 0, 'b' },
02517               { "delim",       required_argument, 0, 'd' },
02518               { "max-mem",     required_argument, 0, 'm' },
02519               { "tree-min",    required_argument, 0, 't' },
02520               { "leaf-thresh", required_argument, 0, 'l' },
02521               { 0, 0, 0, 0 }
02522         };
02523         unsigned long long v;
02524         char *endptr;
02525         int option_index = 0;
02526         int c = getopt_long(argc, argv, "hb:d:l:m:t:",
02527                             long_options, &option_index);
02528         if (c == -1) {
02529             break; /* no more options */
02530         }
02531         switch (c) {
02532             case 'h':
02533                 usage(argv[0]);
02534                 return EXIT_SUCCESS;
02535             case 'b':
02536                 v = strtoull(optarg, &endptr, 0);
02537                 if (endptr == optarg || errno != 0 || v < 1 || v > 1024*1024) {
02538                     err("--blk-size invalid. Please specify an integer between "
02539                         "1 and 1,048,576 (KiB).");
02540                 }
02541                 ioblksz = (size_t) v*1024;
02542                 break;
02543             case 'd':
02544                 if (strlen(optarg) != 1) {
02545                     err("--delim|-d argument must be a single character");
02546                 }
02547                 delim = optarg[0];
02548                 if (delim == '\0' ||
02549                     strchr("\n.+-eE0123456789aAfFiInN", delim) != NULL) {
02550                     err("--delim invalid: '%c'", delim);
02551                 }
02552                 break;
02553             case 'l':
02554                 v = strtoull(optarg, &endptr, 0);
02555                 if (endptr == optarg || errno != 0 || v < 1 || v > 1024*1024) {
02556                     err("--leaf-thresh invalid. Please specify an integer between "
02557                         "1 and 1,048,576.");
02558                 }
02559                 leafthresh = (uint64_t) v;
02560                 break;
02561             case 'm':
02562                 v = strtoull(optarg, &endptr, 0);
02563                 if (endptr == optarg || errno != 0 ||
02564                     v < 1 || v > SIZE_MAX/2097152) {
02565                     err("--max-mem invalid. Please specify an integer between "
02566                         "1 and %llu (MiB).",
02567                         (unsigned long long) SIZE_MAX/2097152);
02568                 }
02569                 memsz = (size_t) v*1024*1024;
02570                 break;
02571             case 't':
02572                 v = strtoull(optarg, &endptr, 0);
02573                 if (endptr == optarg || errno != 0 || v > SIZE_MAX) {
02574                     err("--tree-min invalid. Please specify a non-negative "
02575                         "integer less than or equal to %llu",
02576                         (unsigned long long) SIZE_MAX);
02577                 }
02578                 minpoints = (size_t) v;
02579                 break;
02580             case '?':
02581                 return EXIT_FAILURE;
02582             default:
02583                 abort();
02584         }
02585     }
02586     if (argc - optind < 2) {
02587         err("Output file and at least one input file must be specified");
02588     }
02589     len = strlen(argv[optind]);
02590     treefile = (char *) malloc(len + 5);
02591     datafile = (char *) malloc(len + 5);
02592     scratch = (char *) malloc(len + 5);
02593     if (treefile == NULL || datafile == NULL || scratch == NULL) {
02594         err("malloc() failed");
02595     }
02596     strcpy(treefile, argv[optind]);
02597     strcpy(datafile, argv[optind]);
02598     strcpy(scratch, argv[optind]);
02599     strcat(treefile, ".htm");
02600     strcat(datafile, ".dat");
02601     strcat(scratch, ".scr");
02602 
02603     ++optind;
02604     infile = &argv[optind];
02605     nfile = argc - optind;
02606     mem_params_init(&mem, memsz, ioblksz);
02607 
02608     printf("\n");
02609 
02610     /* Phase 1: produce sorted point file from ASCII inputs */
02611     npoints = blk_sort_ascii(infile, nfile, datafile, delim, &mem);
02612     ext_sort(datafile, scratch, &mem, sizeof(struct tree_entry), npoints,
02613              &tree_entry_cmp);
02614 
02615     if (npoints > minpoints) {
02616         struct tree_root super;
02617         uint64_t filesz;
02618         /* Phase 2: produce sorted tree nodes from data file */
02619         nnodes = tree_gen(datafile, treefile, &mem, &super,
02620                           leafthresh, npoints);
02621         ext_sort(treefile, scratch, &mem, sizeof(struct disk_node), nnodes,
02622                  &disk_node_cmp);
02623         /* Phase 3: compress tree file */
02624         filesz = tree_compress(treefile, scratch, &mem, &super,
02625                                nnodes, leafthresh);
02626         reverse_file(scratch, treefile, &mem, filesz);
02627     }
02628     /* Phase 4: convert spherical coords to unit vectors */
02629     spherical_to_vec(datafile, scratch, &mem, npoints);
02630     free(scratch);
02631     free(datafile);
02632     free(treefile);
02633     return EXIT_SUCCESS;
02634 }
02635 
02638 #ifdef __cplusplus
02639 }
02640 #endif