The Tiny HTM Library
00001 00197 #include <assert.h> 00198 #include <ctype.h> 00199 #include <errno.h> 00200 #include <stdarg.h> 00201 #include <stdio.h> 00202 #include <stdlib.h> 00203 #include <string.h> 00204 #include <sys/types.h> 00205 #include <sys/stat.h> 00206 #include <sys/time.h> 00207 #include <sys/mman.h> 00208 #include <fcntl.h> 00209 #include <unistd.h> 00210 #include <getopt.h> 00211 #include <pthread.h> 00212 00213 #include "tinyhtm/geometry.h" 00214 #include "tinyhtm/htm.h" 00215 #include "tinyhtm/tree.h" 00216 #include "tinyhtm/varint.h" 00217 00218 #ifdef __cplusplus 00219 extern "C" { 00220 #endif 00221 00224 /* ================================================================ */ 00225 /* Utilities and plumbing */ 00226 /* ================================================================ */ 00227 00228 00229 /* Prints an error message to stderr and exits. 00230 */ 00231 static void err(const char *fmt, ...) 00232 { 00233 va_list ap; 00234 fprintf(stderr, "ERROR: "); 00235 va_start(ap, fmt); 00236 vfprintf(stderr, fmt, ap); 00237 va_end(ap); 00238 fprintf(stderr, "\n"); 00239 fflush(stderr); 00240 exit(EXIT_FAILURE); 00241 } 00242 00243 /* Prints a message to stdout and exits. 00244 */ 00245 static void msg(const char *fmt, ...) 00246 { 00247 va_list ap; 00248 va_start(ap, fmt); 00249 vprintf(fmt, ap); 00250 va_end(ap); 00251 fflush(stdout); 00252 } 00253 00254 /* Returns the number of seconds that have elapsed since the epoch. 
00255 */ 00256 static double now() { 00257 struct timeval t; 00258 gettimeofday(&t, NULL); 00259 return ((double) t.tv_sec) + ((double) t.tv_usec) / 1.0e6; 00260 } 00261 00262 00263 /* ---- Memory and IO related parameters ---- */ 00264 00265 struct mem_params { 00266 size_t memsz; /* Max memory usage in bytes */ 00267 size_t sortsz; /* Size of blocks operated on by in-memory sorts */ 00268 size_t ioblksz; /* Size of IO blocks */ 00269 size_t k; /* Number of merge segments in one multi-way merge pass */ 00270 }; 00271 00272 static void mem_params_init(struct mem_params *mem, 00273 size_t total, 00274 size_t ioblksz) 00275 { 00276 const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE); 00277 if (total % (2*pagesz) != 0) { 00278 total += 2*pagesz - total % (2*pagesz); 00279 } 00280 if (ioblksz % pagesz != 0) { 00281 ioblksz += pagesz - ioblksz % pagesz; 00282 } 00283 if (total < 6*ioblksz) { 00284 total = 6*ioblksz; 00285 } 00286 mem->memsz = total; 00287 mem->sortsz = total / 2; 00288 mem->ioblksz = ioblksz; 00289 mem->k = (total - 2*ioblksz) / (2*ioblksz); 00290 } 00291 00292 00293 /* ---- Asynchronous block writer ---- */ 00294 00295 enum bk_write_state { 00296 BK_WRITE_START = 0, 00297 BK_WRITE_READY, 00298 BK_WRITE_BLK, 00299 BK_WRITE_EXIT, 00300 BK_WRITE_ERR 00301 }; 00302 00303 00304 struct blk_writer { 00305 size_t n; /* Number of bytes per block */ 00306 size_t i; /* Number of bytes in current block */ 00307 unsigned char *buf; /* Current block pointer */ 00308 void *mem; /* Space for 2 blocks of n tree entries */ 00309 size_t memsz; /* Size of mem in bytes */ 00310 int fd; /* File descriptor of file being written to */ 00311 enum bk_write_state state; 00312 unsigned char *wrbuf; 00313 size_t wrbytes; 00314 pthread_attr_t attr; 00315 pthread_mutex_t mtx; 00316 pthread_cond_t cv; 00317 pthread_t thr; 00318 }; 00319 00320 00321 /* Writes data blocks in the background. 
00322 */ 00323 static void * bk_write(void *arg) 00324 { 00325 struct blk_writer *w = (struct blk_writer *) arg; 00326 pthread_mutex_lock(&w->mtx); 00327 while (1) { 00328 unsigned char *buf; 00329 size_t n; 00330 ssize_t b; 00331 /* signal readiness for a command */ 00332 w->state = BK_WRITE_READY; 00333 pthread_cond_signal(&w->cv); 00334 /* wait for a command */ 00335 pthread_cond_wait(&w->cv, &w->mtx); 00336 if (w->state == BK_WRITE_READY) { 00337 continue; /* nothing to do */ 00338 } else if (w->state == BK_WRITE_EXIT) { 00339 break; /* exit background writer thread */ 00340 } 00341 /* write a block */ 00342 buf = w->wrbuf; 00343 n = w->wrbytes; 00344 pthread_mutex_unlock(&w->mtx); 00345 while (n > 0) { 00346 b = write(w->fd, buf, n); 00347 if (b < 0) { 00348 if (errno != EINTR) { 00349 /* error - exit background writer thread */ 00350 pthread_mutex_lock(&w->mtx); 00351 w->state = BK_WRITE_ERR; 00352 goto done; 00353 } 00354 } else { 00355 buf += b; 00356 n -= (size_t) b; 00357 } 00358 } 00359 pthread_mutex_lock(&w->mtx); 00360 } 00361 done: 00362 pthread_cond_signal(&w->cv); 00363 pthread_mutex_unlock(&w->mtx); 00364 return NULL; 00365 } 00366 00367 00368 /* Creates a new block writer with the given block size. 
00369 */ 00370 static struct blk_writer * blk_writer_init(const char * const file, 00371 size_t blksz) 00372 { 00373 struct blk_writer *w; 00374 const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE); 00375 00376 w = (struct blk_writer *) malloc(sizeof(struct blk_writer)); 00377 if (w == NULL) { 00378 err("malloc() failed"); 00379 } 00380 w->n = blksz; 00381 w->i = 0; 00382 w->fd = open(file, O_CREAT | O_TRUNC | O_APPEND | O_WRONLY, 00383 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 00384 if (w->fd == -1) { 00385 err("failed to open file %s for writing", file); 00386 } 00387 w->memsz = 2 * blksz; 00388 if (w->memsz % pagesz != 0) { 00389 w->memsz += pagesz - w->memsz % pagesz; 00390 } 00391 w->mem = mmap(NULL, w->memsz, PROT_READ | PROT_WRITE, 00392 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 00393 if (w->mem == MAP_FAILED) { 00394 err("write buffer allocation via mmap() failed"); 00395 } 00396 w->buf = (unsigned char *) w->mem; 00397 w->state = BK_WRITE_START; 00398 w->wrbuf = NULL; 00399 w->wrbytes = 0; 00400 pthread_attr_init(&w->attr); 00401 pthread_attr_setdetachstate(&w->attr, PTHREAD_CREATE_JOINABLE); 00402 pthread_mutex_init(&w->mtx, NULL); 00403 pthread_cond_init(&w->cv, NULL); 00404 pthread_create(&w->thr, &w->attr, &bk_write, (void *) w); 00405 return w; 00406 } 00407 00408 00409 /* Issues (asynchronous) write of one block. 
00410 */ 00411 static void blk_writer_issue(struct blk_writer * const w, 00412 void (*sortfn)(void *, size_t)) 00413 { 00414 if (sortfn != NULL) { 00415 (*sortfn)(w->buf, w->i); 00416 } 00417 pthread_mutex_lock(&w->mtx); 00418 /* wait until background writer thread is ready for a command */ 00419 while (w->state != BK_WRITE_READY && w->state != BK_WRITE_ERR) { 00420 pthread_cond_wait(&w->cv, &w->mtx); 00421 } 00422 if (w->state == BK_WRITE_ERR) { 00423 pthread_mutex_unlock(&w->mtx); 00424 err("background thread failed to write() disk block"); 00425 } 00426 /* issue write for the current block */ 00427 w->state = BK_WRITE_BLK; 00428 w->wrbuf = w->buf; 00429 w->wrbytes = w->i; 00430 pthread_cond_signal(&w->cv); 00431 pthread_mutex_unlock(&w->mtx); 00432 /* flip write buffer */ 00433 w->i = 0; 00434 if (w->buf == (unsigned char *) w->mem) { 00435 w->buf = ((unsigned char *) w->mem) + w->n; 00436 } else { 00437 w->buf = ((unsigned char *) w->mem); 00438 } 00439 } 00440 00441 00442 /* Closes writer w; all as yet unwritten data is flushed to disk first. 00443 00444 If a non-NULL sorting function pointer is supplied, unwritten 00445 data is sorted prior to being written. 00446 */ 00447 static void blk_writer_close(struct blk_writer * const w, 00448 void (*sortfn)(void *, size_t)) 00449 { 00450 blk_writer_issue(w, sortfn); 00451 pthread_mutex_lock(&w->mtx); 00452 /* wait until background writer thread is ready for a command */ 00453 while (w->state != BK_WRITE_READY && w->state != BK_WRITE_ERR) { 00454 pthread_cond_wait(&w->cv, &w->mtx); 00455 } 00456 if (w->state == BK_WRITE_ERR) { 00457 pthread_mutex_unlock(&w->mtx); 00458 err("background thread failed to write() disk block"); 00459 } 00460 /* issue thread exit command ... */ 00461 w->state = BK_WRITE_EXIT; 00462 pthread_cond_signal(&w->cv); 00463 pthread_mutex_unlock(&w->mtx); 00464 /* ... 
and wait for writer thread to terminate */ 00465 pthread_join(w->thr, NULL); 00466 /* clean up */ 00467 if (munmap(w->mem, w->memsz) != 0) { 00468 err("munmap() of write buffers failed"); 00469 } 00470 if (fdatasync(w->fd) != 0) { 00471 err("fdatasync() failed"); 00472 } 00473 if (close(w->fd) != 0) { 00474 err("file close() failed"); 00475 } 00476 pthread_cond_destroy(&w->cv); 00477 pthread_mutex_destroy(&w->mtx); 00478 pthread_attr_destroy(&w->attr); 00479 free(w); 00480 } 00481 00482 00483 /* Appends data to the given block writer. 00484 */ 00485 HTM_INLINE void blk_writer_append(struct blk_writer * const w, 00486 const void * const data, 00487 const size_t nbytes, 00488 void (*sortfn)(void *, size_t)) 00489 { 00490 size_t i = w->i; 00491 if (i + nbytes > w->n) { 00492 blk_writer_issue(w, sortfn); 00493 i = 0; 00494 } 00495 assert(i + nbytes <= w->n); 00496 memcpy(w->buf + i, data, nbytes); 00497 w->i = i + nbytes; 00498 } 00499 00500 00501 /* ---- Multi-way external merge sort ---- */ 00502 00503 /* A contiguous sequence of sorted items, dubbed a multi-way merge segment. 00504 */ 00505 struct mrg_seg { 00506 const void *cur; 00507 const void *end; 00508 const void *blk; 00509 }; 00510 00511 00512 /* Initializes a merge segment. 00513 */ 00514 static void mrg_seg_init(struct mrg_seg * const s, 00515 const void * start, 00516 const void * end, 00517 const size_t blksz) 00518 { 00519 size_t n; 00520 const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE); 00521 size_t nbytes = (const char *) end - (const char *) start; 00522 if (end <= start || blksz % pagesz != 0) { 00523 err("Invalid merge segment"); 00524 } 00525 s->cur = start; 00526 s->end = end; 00527 n = ((size_t) start) % pagesz; 00528 if (n != 0) { 00529 start = (const char *) start - n; 00530 nbytes += n; 00531 } 00532 n = (nbytes > 2*blksz ? 
2*blksz : nbytes); 00533 if (n % pagesz != 0) { 00534 n += pagesz - n % pagesz; 00535 } 00536 s->blk = (const char *) start + blksz; 00537 if (madvise((void *) start, n, MADV_WILLNEED) != 0) { 00538 err("madvise() failed"); 00539 } 00540 } 00541 00542 00543 /* Handles prefetch and merge segment exhaustion. 00544 */ 00545 static int mrg_seg_advance(struct mrg_seg * const s, 00546 const size_t blksz, 00547 const void * const cur) 00548 { 00549 const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE); 00550 00551 if (cur == s->end) { 00552 void *start = (char *) s->blk - blksz; 00553 size_t n = (char *) s->end - (char *) start; 00554 if (n % pagesz != 0) { 00555 n += pagesz - n % pagesz; 00556 } 00557 if (madvise(start, n, MADV_DONTNEED) != 0) { 00558 err("madvise() failed"); 00559 } 00560 return 0; 00561 } 00562 assert(cur >= s->blk && cur < s->end); 00563 if (madvise((char *) s->blk - blksz, blksz, MADV_DONTNEED) != 0) { 00564 err("madvise() failed"); 00565 } 00566 s->cur = cur; 00567 s->blk = (const char *) s->blk + blksz; 00568 if (s->blk < s->end) { 00569 size_t n = (char *) s->end - (char *) s->blk; 00570 if (n >= blksz) { 00571 n = blksz; 00572 } else if (n % pagesz != 0) { 00573 n += pagesz - n % pagesz; 00574 } 00575 if (madvise((void *) s->blk, n, MADV_WILLNEED) != 0) { 00576 err("madvise() failed"); 00577 } 00578 } 00579 return 1; 00580 } 00581 00582 00583 /* Consumes one item in the given merge segment; returns 0 if there 00584 are no more items in the segment. 00585 */ 00586 HTM_INLINE int mrg_seg_consume(struct mrg_seg * const s, 00587 const size_t blksz, 00588 const size_t itemsz) 00589 { 00590 const void *cur = (const char *) s->cur + itemsz; 00591 if (cur < s->end && cur < s->blk) { 00592 s->cur = cur; 00593 return 1; 00594 } 00595 return mrg_seg_advance(s, blksz, cur); 00596 } 00597 00598 00599 /* Adds segs[n] to the min-heap segs[0], segs[1], ..., segs[n - 1]. 
00600 */ 00601 static void heap_up(struct mrg_seg *segs, 00602 size_t n, 00603 int (*cmpfn)(const void *, const void *)) 00604 { 00605 struct mrg_seg tmp; 00606 size_t p; 00607 00608 while (n != 0) { 00609 p = (n - 1) / 2; 00610 if ((*cmpfn)(segs[p].cur, segs[n].cur) != 0) { 00611 break; 00612 } 00613 tmp = segs[p]; 00614 segs[p] = segs[n]; 00615 segs[n] = tmp; 00616 n = p; 00617 } 00618 } 00619 00620 00621 /* Fix possible min-heap violation by segs[0]. 00622 */ 00623 static void heap_down(struct mrg_seg *segs, 00624 const size_t n, 00625 int (*cmpfn)(const void *, const void *)) 00626 { 00627 struct mrg_seg tmp; 00628 size_t i; 00629 00630 if (n > 1) { 00631 i = 0; 00632 while (1) { 00633 size_t left = 2*i + 1; 00634 size_t right = 2*i + 2; 00635 size_t least = i; 00636 if (left < n && (*cmpfn)(segs[left].cur, segs[i].cur) != 0) { 00637 least = left; 00638 } 00639 if (right < n && (*cmpfn)(segs[right].cur, segs[least].cur) != 0) { 00640 least = right; 00641 } 00642 if (least == i) { 00643 break; 00644 } 00645 tmp = segs[i]; 00646 segs[i] = segs[least]; 00647 segs[least] = tmp; 00648 i = least; 00649 } 00650 } 00651 } 00652 00653 00654 /* Performs one multi-way merge pass. 00655 */ 00656 static void mrg_pass(struct blk_writer * const w, 00657 const void * const data, 00658 const struct mem_params * const mem, 00659 const size_t filesz, 00660 const size_t sortsz, 00661 const size_t itemsz, 00662 int (*cmpfn)(const void *, const void *)) 00663 { 00664 struct mrg_seg *segs; 00665 size_t start; 00666 00667 /* allocate merge segments */ 00668 segs = (struct mrg_seg *) malloc( 00669 mem->k * sizeof(struct mrg_seg)); 00670 if (segs == NULL) { 00671 err("malloc() failed"); 00672 } 00673 for (start = 0; start < filesz;) { 00674 size_t ns, end; 00675 /* initialize up to mem->k merge segments */ 00676 for (ns = 0; ns < mem->k && start < filesz; ++ns, start = end) { 00677 end = (start + sortsz > filesz ? 
filesz : start + sortsz); 00678 mrg_seg_init(&segs[ns], 00679 (const char *) data + start, 00680 (const char *) data + end, 00681 mem->ioblksz); 00682 heap_up(segs, ns, cmpfn); 00683 } 00684 /* merge ns segments */ 00685 while (ns > 0) { 00686 /* write minimum value from all ns merge segments to disk */ 00687 blk_writer_append(w, segs->cur, itemsz, NULL); 00688 if (mrg_seg_consume(segs, mem->ioblksz, itemsz) == 0) { 00689 segs[0] = segs[ns - 1]; 00690 --ns; 00691 } 00692 heap_down(segs, ns, cmpfn); 00693 } 00694 } 00695 free(segs); 00696 } 00697 00698 00699 /* Computes the number of k-way merge passes required to sort n items, 00700 i.e. the ceiling of the base-k logarithm of n. 00701 */ 00702 static int mrg_npasses(size_t n, size_t k) { 00703 int m = 1; 00704 size_t b = k; 00705 while (b < n) { 00706 b *= k; 00707 ++m; 00708 } 00709 return m; 00710 } 00711 00712 00713 /* Sorts the given file using multi-way external merge sort. 00714 00715 @param[in] file File to sort. Runs of size mem->sortb are assumed 00716 to be sorted already. 00717 @param[in] tmpl Temporary file name template. 00718 @param[in] mem Memory parameters. 00719 @param[in] itemsz Size of a single item in bytes. 00720 @param[in] nitems Number of items in file. 00721 */ 00722 static void ext_sort(const char *file, 00723 const char *scratch, 00724 const struct mem_params *mem, 00725 size_t itemsz, 00726 size_t nitems, 00727 int (*cmpfn)(const void *, const void *)) 00728 { 00729 const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE); 00730 const size_t filesz = nitems * itemsz; 00731 size_t sortsz = mem->sortsz - mem->sortsz % itemsz; 00732 const size_t nblk = filesz / sortsz + (filesz % sortsz != 0 ? 
1u : 0u); 00733 const int nmp = mrg_npasses(nblk, mem->k); 00734 int mp; 00735 double ttot; 00736 00737 if (nblk < 2) { 00738 msg("Skipping multi-way merge step (%s already sorted)\n", file); 00739 return; 00740 } 00741 msg("Multi-way external merge sort of %s\n", file); 00742 ttot = now(); 00743 00744 /* Perform k-way merge passes; the sorted block grows by a 00745 factor of k on every pass */ 00746 for (mp = 0; mp < nmp; ++mp, sortsz *= mem->k) { 00747 const char *inf; 00748 const char *outf; 00749 const void *data; 00750 struct blk_writer *w; 00751 double t; 00752 size_t nmap; 00753 int infd; 00754 00755 msg("\t- merge pass %d/%d ... ", mp + 1, nmp); 00756 t = now(); 00757 00758 /* memory map input file and create writer for output file */ 00759 if (mp % 2 == 0) { 00760 inf = file; 00761 outf = scratch; 00762 } else { 00763 inf = scratch; 00764 outf = file; 00765 } 00766 infd = open(inf, O_RDONLY); 00767 if (infd == -1) { 00768 err("failed to open file %s for reading", inf); 00769 } 00770 if (filesz % pagesz != 0) { 00771 nmap = filesz + (pagesz - filesz % pagesz); 00772 } else { 00773 nmap = filesz; 00774 } 00775 data = mmap(NULL, nmap, PROT_READ, 00776 MAP_SHARED | MAP_NORESERVE, infd, 0); 00777 if (data == MAP_FAILED) { 00778 err("failed to mmap() file %s", inf); 00779 } 00780 if (madvise((void *) data, filesz, MADV_DONTNEED) != 0) { 00781 err("madvise() failed"); 00782 } 00783 w = blk_writer_init(outf, mem->ioblksz); 00784 /* perform merges */ 00785 mrg_pass(w, data, mem, filesz, sortsz, itemsz, cmpfn); 00786 /* cleanup */ 00787 blk_writer_close(w, NULL); 00788 if (munmap((void *) data, nmap) != 0) { 00789 err("failed to munmap() file %s", inf); 00790 } 00791 if (close(infd) != 0) { 00792 err("failed to close() file %s", inf); 00793 } 00794 msg("%.3f sec\n", now() - t); 00795 } 00796 /* make sure sorted results are in data file and delete scratch file. 
*/ 00797 if (nmp % 2 == 1) { 00798 if (rename(scratch, file) != 0) { 00799 err("failed to rename file %s to %s", scratch, file); 00800 } 00801 } else { 00802 if (unlink(scratch) != 0) { 00803 err("failed to delete file %s", scratch); 00804 } 00805 } 00806 msg("\t%.3f sec total\n\n", now() - ttot); 00807 } 00808 00809 00810 /* ---- Fast constant size memory allocator ---- */ 00811 00812 /* Undefine FAST_ALLOC (or define it as 0) to allocate nodes with malloc() 00813 instead. Useful when checking memory safety, e.g. with valgrind. 00814 */ 00815 #define FAST_ALLOC 1 00816 00817 #if FAST_ALLOC 00818 # define ARENA_SEGSZ 16*1024*1024 00819 00820 /* A contiguous memory segment belonging to an arena. 00821 */ 00822 struct arena_seg { 00823 struct arena_seg *prev; 00824 void *mem; 00825 }; 00826 00827 static struct arena_seg * arena_seg_init(struct arena_seg * const prev, 00828 const size_t itemsz) 00829 { 00830 unsigned char *node, *end; 00831 struct arena_seg *seg = (struct arena_seg *) malloc(sizeof(struct arena_seg)); 00832 if (seg == NULL) { 00833 err("malloc() failed"); 00834 } 00835 seg->prev = prev; 00836 seg->mem = mmap(NULL, ARENA_SEGSZ, PROT_READ | PROT_WRITE, 00837 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 00838 if (seg->mem == MAP_FAILED) { 00839 err("mmap() failed"); 00840 } 00841 /* initialize free list */ 00842 node = (unsigned char *) seg->mem; 00843 end = node + (ARENA_SEGSZ/itemsz - 1)*itemsz; 00844 while (node < end) { 00845 *((void **) node) = node + itemsz; 00846 node += itemsz; 00847 } 00848 *((void **) node) = NULL; 00849 return seg; 00850 } 00851 00852 /* A (non-shrinkable) memory arena. Memory nodes can be freed one 00853 at a time, or en-masse. 
00854 */ 00855 struct arena { 00856 struct arena_seg *tail; /* reverse linked list of memory segments */ 00857 struct mem_node *head; /* head of linked list of free memory locations */ 00858 size_t itemsz; 00859 size_t nseg; 00860 }; 00861 00862 static void arena_init(struct arena * const a, const size_t itemsz) 00863 { 00864 a->tail = arena_seg_init(NULL, itemsz); 00865 a->head = a->tail->mem; 00866 a->itemsz = itemsz; 00867 a->nseg = 1; 00868 } 00869 00870 static void arena_destroy(struct arena * const a) 00871 { 00872 struct arena_seg *seg = a->tail; 00873 while (seg != NULL) { 00874 struct arena_seg *prev = seg->prev; 00875 if (munmap(seg->mem, ARENA_SEGSZ) != 0) { 00876 err("munmap() failed"); 00877 } 00878 seg->prev = NULL; 00879 seg->mem = NULL; 00880 free(seg); 00881 seg = prev; 00882 } 00883 a->tail = NULL; 00884 a->head = NULL; 00885 a->nseg = 0; 00886 } 00887 00888 HTM_INLINE void * arena_alloc(struct arena * const a) 00889 { 00890 void *item; 00891 if (a->head == NULL) { 00892 a->tail = arena_seg_init(a->tail, a->itemsz); 00893 a->head = a->tail->mem; 00894 ++a->nseg; 00895 } 00896 item = a->head; 00897 a->head = *((void **) item); 00898 return item; 00899 } 00900 00901 HTM_INLINE void arena_free(struct arena * const a, void * const n) 00902 { 00903 *((void **) n) = a->head; 00904 a->head = n; 00905 } 00906 00907 #endif /* FAST_ALLOC */ 00908 00909 00910 /* ================================================================ */ 00911 /* Phase 1: Produce data file from ASCII inputs */ 00912 /* ================================================================ */ 00913 00914 /* An entry in an HTM tree. 00915 */ 00916 struct tree_entry { 00917 int64_t htmid; 00918 int64_t rowid; 00919 struct htm_sc sc; 00920 } HTM_ALIGNED(16); 00921 00922 /* Tests whether tree entry e1 is less than e2; this is 00923 the same as testing whether the HTM ID of e1 is less than e2. 00924 Row IDs are used to break ties. 
00925 */ 00926 HTM_INLINE int tree_entry_lt(const struct tree_entry *e1, 00927 const struct tree_entry *e2) 00928 { 00929 return (e1->htmid < e2->htmid || 00930 (e1->htmid == e2->htmid && e1->rowid < e2->rowid)); 00931 } 00932 00933 /* Returns 1 if the first tree entry is less than the second. 00934 */ 00935 static int tree_entry_cmp(const void *e1, const void *e2) 00936 { 00937 return tree_entry_lt((const struct tree_entry *) e1, 00938 (const struct tree_entry *) e2); 00939 } 00940 00941 /* ---- Quicksort of tree entries ---- */ 00942 00943 static void tree_entry_isort(struct tree_entry *entries, size_t n) 00944 { 00945 size_t i, j, k; 00946 for (i = 0; i < n; ++i) { 00947 k = i; 00948 for (j = i + 1; j < n; ++j) { 00949 if (tree_entry_lt(&entries[j], &entries[k])) { 00950 k = j; 00951 } 00952 } 00953 if (k != i) { 00954 struct tree_entry tmp = entries[k]; 00955 entries[k] = entries[i]; 00956 entries[i] = tmp; 00957 } 00958 } 00959 } 00960 00961 static void tree_entry_qsort(struct tree_entry *entries, 00962 size_t left, 00963 size_t right) 00964 { 00965 struct tree_entry pivot; 00966 size_t l, r, mid; 00967 00968 while (1) { 00969 00970 if (right <= left + 8) { 00971 tree_entry_isort(entries + left, right - left + 1); 00972 return; 00973 } 00974 00975 /* find median-of-3 */ 00976 mid = left + ((right - left) >> 1); 00977 if (tree_entry_lt(entries + left, entries + mid)) { 00978 if (tree_entry_lt(entries + right, entries + left)) { 00979 mid = left; /* right, left, mid */ 00980 } else if (tree_entry_lt(entries + right, entries + mid)) { 00981 mid = right; /* left, right, mid */ 00982 } /* left, mid, right */ 00983 } else { 00984 if (tree_entry_lt(entries + left, entries + right)) { 00985 mid = left; /* mid, left, right */ 00986 } else if (tree_entry_lt(entries + mid, entries + right)) { 00987 mid = right; /* mid, right, left */ 00988 } /* right, mid, left */ 00989 } 00990 00991 /* set pivot to median-of-3 and store at left-most array location */ 00992 pivot = 
entries[mid]; 00993 if (mid != left) { 00994 entries[mid] = entries[left]; 00995 entries[left] = pivot; 00996 } 00997 00998 /* partition around pivot */ 00999 l = left; 01000 r = right; 01001 do { 01002 while (tree_entry_lt(&pivot, &entries[right])) { 01003 --right; 01004 if (left >= right) { 01005 goto end; 01006 } 01007 } 01008 entries[left] = entries[right]; 01009 do { 01010 ++left; 01011 if (left >= right) { 01012 left = right; 01013 goto end; 01014 } 01015 } while (tree_entry_lt(&entries[left], &pivot)); 01016 entries[right] = entries[left]; 01017 --right; 01018 } while (left < right); 01019 end: 01020 entries[left] = pivot; 01021 01022 /* recurse on smaller partition */ 01023 if (2*left <= r + l) { 01024 if (l < left) { 01025 tree_entry_qsort(entries, l, left - 1); 01026 } 01027 ++left; 01028 right = r; 01029 } else { 01030 if (r > left) { 01031 tree_entry_qsort(entries, left + 1, r); 01032 } 01033 right = left - 1; 01034 left = l; 01035 } 01036 /* process larger partition without recursing to limit stack usage */ 01037 } 01038 } 01039 01040 01041 /* Sorts a memory block containing tree entries. 
01042 */ 01043 static void tree_entry_sort(void *data, size_t nbytes) 01044 { 01045 size_t n; 01046 assert(((size_t) data) % 16 == 0 && 01047 "block pointer not aligned to a multiple of 16 bytes"); 01048 assert(nbytes % sizeof(struct tree_entry) == 0 && 01049 "block size not a multiple of sizeof(struct tree_entry)"); 01050 n = nbytes / sizeof(struct tree_entry); 01051 if (n > 1) { 01052 tree_entry_qsort((struct tree_entry *) data, 0, n - 1); 01053 } 01054 } 01055 01056 01057 /* ---- Convert text inputs to a block-sorted binary tree entry file ---- */ 01058 01059 static char * eat_delim(char *s, char delim, const char *fname, size_t lineno) 01060 { 01061 for (; isspace(*s) && *s != delim; ++s) { } 01062 if (*s != delim) { 01063 err("[%s:%llu] - invalid/truncated record", 01064 fname, (unsigned long long) lineno); 01065 } 01066 return s + 1; 01067 } 01068 01069 static char * eat_ws(char *s, char delim, const char *fname, size_t lineno) 01070 { 01071 for (; isspace(*s) && *s != delim; ++s) { } 01072 if (*s == delim || *s == '\0') { 01073 err("[%s:%llu] - invalid/truncated record", 01074 fname, (unsigned long long) lineno); 01075 } 01076 return s; 01077 } 01078 01079 01080 /* Converts ASCII input files to a block-sorted binary file. 01081 */ 01082 static size_t blk_sort_ascii(char **infile, 01083 const int nfile, 01084 const char *outfile, 01085 const char delim, 01086 const struct mem_params * const mem) 01087 { 01088 char line[16384]; 01089 size_t nentries; 01090 struct blk_writer *out; 01091 double ttot; 01092 int i; 01093 01094 if (mem->sortsz % sizeof(struct tree_entry) != 0) { 01095 err("Write block size is not a multiple of " 01096 "sizeof(struct tree_entry)"); 01097 } 01098 msg("Creating block-sorted tree entry file %s from ASCII file(s)\n", 01099 outfile); 01100 ttot = now(); 01101 nentries = 0; 01102 out = blk_writer_init(outfile, mem->sortsz); 01103 01104 /* For each input file... 
*/ 01105 for (i = 0; i < nfile; ++i) { 01106 FILE *f; 01107 size_t lineno; 01108 struct tree_entry entry; 01109 struct htm_v3 v; 01110 double t; 01111 01112 msg("\t- processing %s ... ", infile[i]); 01113 t = now(); 01114 01115 /* open input file */ 01116 f = fopen(infile[i], "r"); 01117 if (f == NULL) { 01118 err("Failed to open file %s for reading", infile[i]); 01119 } 01120 for (lineno = 1; fgets(line, sizeof(line), f) != NULL; ++lineno) { 01121 char *s, *endptr; 01122 double lon, lat; 01123 size_t len = strlen(line); 01124 01125 if (line[len - 1] != '\n') { 01126 if (feof(f) == 0) { 01127 err("Line %llu of file %s is too long (> %d characters)", 01128 (unsigned long long) lineno, infile[i], 01129 (int) sizeof(line)); 01130 } 01131 } 01132 s = eat_ws(line, delim, infile[i], lineno); 01133 entry.rowid = (int64_t) strtoll(s, &endptr, 0); 01134 if (endptr == s || endptr == NULL || errno != 0) { 01135 err("[%s:%llu] - failed to convert row_id to an integer", 01136 infile[i], (unsigned long long) lineno); 01137 } 01138 s = eat_delim(endptr, delim, infile[i], lineno); 01139 s = eat_ws(s, delim, infile[i], lineno); 01140 lon = strtod(s, &endptr); 01141 if (endptr == s || endptr == NULL || errno != 0) { 01142 err("[%s:%llu] - failed to convert right ascension/longitude " 01143 "to a double", infile[i], (unsigned long long) lineno); 01144 } 01145 s = eat_delim(endptr, delim, infile[i], lineno); 01146 s = eat_ws(s, delim, infile[i], lineno); 01147 lat = strtod(s, &endptr); 01148 if (endptr == s || endptr == NULL || errno != 0) { 01149 err("[%s:%llu] - failed to convert declination/latitude " 01150 "to a double", infile[i], (unsigned long long) lineno); 01151 } 01152 s = endptr; 01153 if (*s != delim && *s != '\0' && !isspace(*s)) { 01154 err("[%s:%llu] - invalid record", 01155 infile[i], (unsigned long long) lineno); 01156 } 01157 /* compute and store tree_entry for line */ 01158 if (htm_sc_init(&entry.sc, lon, lat) != HTM_OK) { 01159 err("[%s:%llu] - invalid spherical 
coordinates", 01160 infile[i], (unsigned long long) lineno); 01161 } 01162 if (htm_sc_tov3(&v, &entry.sc) != HTM_OK) { 01163 err("[%s:%llu] - failed to convert spherical coordinates " 01164 "to a unit vector", infile[i], (unsigned long long) lineno); 01165 } 01166 entry.htmid = htm_v3_id(&v, 20); 01167 if (entry.htmid == 0) { 01168 err("[%s:%llu] - failed to compute HTM ID for spherical " 01169 "coordinates", infile[i], (unsigned long long) lineno); 01170 } 01171 blk_writer_append(out, &entry, sizeof(struct tree_entry), 01172 &tree_entry_sort); 01173 } 01174 if (ferror(f) != 0) { 01175 err("failed to read file %s", infile[i]); 01176 } 01177 if (fclose(f) != 0) { 01178 err("failed to close file %s", infile[i]); 01179 } 01180 nentries += lineno - 1; 01181 msg("%llu records in %.3f sec\n", 01182 (unsigned long long) lineno, now() - t); 01183 /* advance to next input file */ 01184 } 01185 01186 /* flush and close block writer */ 01187 blk_writer_close(out, &tree_entry_sort); 01188 msg("\t%.3f sec for %llu records total\n\n", 01189 now() - ttot, (unsigned long long) nentries); 01190 return nentries; 01191 } 01192 01193 01194 /* ================================================================ */ 01195 /* Phase 2: Tree generation and layout */ 01196 /* ================================================================ */ 01197 01198 /* Number of levels-of-detail used by Split-and-Refine. 01199 */ 01200 #define NLOD 5 01201 01202 /* Tree layout block sizes in bytes, from largest to smallest. 01203 */ 01204 static const uint32_t layout_size[NLOD] = { 01205 2097152, /* 2MiB: large page size for modern x86 processors. */ 01206 65536, /* Between small and large page size. Chosen in hopes 01207 of improving OS disk prefetch effectiveness. */ 01208 4096, /* 4KiB: default page size for modern x86 processors */ 01209 256, /* Between cache line and small page sizes. Chosen in hopes 01210 of improving HW cache-line prefetch effectiveness. 
*/ 01211 64 /* L1/L2/L3 cache-line size for modern x86 processors. */ 01212 }; 01213 01214 /* A node ID, consisting of NLOD block IDs (one per layout block size) 01215 and the index of the node in a post-order traversal. This last index 01216 makes node IDs unique (more than one node might fit in a block at 01217 the finest LOD). 01218 */ 01219 struct node_id { 01220 uint64_t block[NLOD + 1]; 01221 } HTM_ALIGNED(16); 01222 01223 /* On-disk representation of a tree node. 01224 */ 01225 struct disk_node { 01226 struct node_id id; 01227 uint64_t count; 01228 uint64_t index; 01229 struct node_id child[4]; 01230 } HTM_ALIGNED(16); 01231 01232 /* Returns 1 if the given node ID corresponds to an empty child 01233 (all block IDs are zero). 01234 */ 01235 HTM_INLINE int node_empty(const struct node_id * const id) 01236 { 01237 int i; 01238 for (i = 0; i < NLOD + 1; ++i) { 01239 if (id->block[i] != 0) { 01240 return 0; 01241 } 01242 } 01243 return 1; 01244 } 01245 01246 /* Returns 1 if the given node IDs are equal. 01247 */ 01248 HTM_INLINE int node_id_eq(const struct node_id * const id1, 01249 const struct node_id * const id2) 01250 { 01251 int i; 01252 for (i = 0; i < NLOD + 1; ++i) { 01253 if (id1->block[i] != id2->block[i]) { 01254 return 0; 01255 } 01256 } 01257 return 1; 01258 } 01259 01260 /* Returns 1 if the first node ID is less than the second. We say that 01261 a node N1 is less than N2 if the string of block IDs for N1 is 01262 lexicographically less than the string for N2. 01263 */ 01264 HTM_INLINE int node_id_lt(const struct node_id * const id1, 01265 const struct node_id * const id2) 01266 { 01267 int i; 01268 for (i = 0; i < NLOD + 1; ++i) { 01269 if (id1->block[i] < id2->block[i]) { 01270 return 1; 01271 } else if (id1->block[i] > id2->block[i]) { 01272 break; 01273 } 01274 } 01275 return 0; 01276 } 01277 01278 /* Compares nodes by ID. 
*/
HTM_INLINE int disk_node_lt(const struct disk_node *n1,
                            const struct disk_node *n2)
{
    return node_id_lt(&n1->id, &n2->id);
}

/* Type-erased wrapper around disk_node_lt().
   NOTE(review): this returns only 0 or 1, never a negative value, so it
   is a boolean "less than" predicate rather than a 3-way comparator;
   confirm callers do not pass it to libc qsort()/bsearch() expecting
   negative/zero/positive results. */
static int disk_node_cmp(const void *n1, const void * n2)
{
    return disk_node_lt((const struct disk_node *) n1,
                        (const struct disk_node *) n2);
}


/* ---- Quicksort of disk nodes ---- */

/* Sorts the n disk nodes starting at nodes in-place.  Despite the
   "isort" name this is a selection sort (one swap per outer iteration);
   it is used by disk_node_qsort() for small partitions. */
static void disk_node_isort(struct disk_node *nodes, size_t n)
{
    size_t i, j, k;
    for (i = 0; i < n; ++i) {
        /* locate the minimum of nodes[i .. n-1] */
        k = i;
        for (j = i + 1; j < n; ++j) {
            if (disk_node_lt(&nodes[j], &nodes[k])) {
                k = j;
            }
        }
        if (k != i) {
            struct disk_node tmp = nodes[k];
            nodes[k] = nodes[i];
            nodes[i] = tmp;
        }
    }
}

/* Median-of-3 quicksort of nodes[left .. right] (bounds inclusive).
   Ranges of at most 9 elements are handed to disk_node_isort().  After
   each partition only the smaller side is sorted by recursion; the
   larger side is re-processed by the enclosing while-loop, which bounds
   the recursion depth to O(log n). */
static void disk_node_qsort(struct disk_node *nodes, size_t left, size_t right)
{
    struct disk_node pivot;
    size_t l, r, mid;

    while (1) {

        if (right <= left + 8) {
            disk_node_isort(nodes + left, right - left + 1);
            return;
        }

        /* find median-of-3 */
        mid = left + ((right - left) >> 1);
        if (disk_node_lt(nodes + left, nodes + mid)) {
            if (disk_node_lt(nodes + right, nodes + left)) {
                mid = left;  /* right, left, mid */
            } else if (disk_node_lt(nodes + right, nodes + mid)) {
                mid = right; /* left, right, mid */
            }                /* left, mid, right */
        } else {
            if (disk_node_lt(nodes + left, nodes + right)) {
                mid = left;  /* mid, left, right */
            } else if (disk_node_lt(nodes + mid, nodes + right)) {
                mid = right; /* mid, right, left */
            }                /* right, mid, left */
        }

        /* set pivot to median-of-3 and store at left-most array location */
        pivot = nodes[mid];
        if (mid != left) {
            nodes[mid] = nodes[left];
            nodes[left] = pivot;
        }

        /* partition around pivot; l and r preserve the original bounds,
           left and right are consumed by the scan below. */
        l = left;
        r = right;
        do {
            while (disk_node_lt(&pivot, &nodes[right])) {
                --right;
                if (left >= right) {
                    goto end;
                }
            }
            /* nodes[left] currently holds the pivot (or a hole); fill it
               from the right, leaving a hole at nodes[right]. */
            nodes[left] = nodes[right];
            do {
                ++left;
                if (left >= right) {
                    left = right;
                    goto end;
                }
            } while (disk_node_lt(&nodes[left], &pivot));
            nodes[right] = nodes[left];
            --right;
        } while (left < right);
end:
        /* drop the pivot into the hole; it is now in final position */
        nodes[left] = pivot;

        /* recurse on smaller partition */
        if (2*left <= r + l) {
            if (l < left) {
                disk_node_qsort(nodes, l, left - 1);
            }
            ++left;
            right = r;
        } else {
            if (r > left) {
                disk_node_qsort(nodes, left + 1, r);
            }
            right = left - 1;
            left = l;
        }
        /* process larger partition without recursing to limit stack usage */
    }
}


/* Sorts a memory block containing disk nodes.  Matches the signature
   expected by the blk_writer sort callback (see blk_writer_close /
   blk_writer_append calls elsewhere in this file). */
static void disk_node_sort(void *data, size_t nbytes)
{
    size_t n;
    assert(((size_t) data) % 16 == 0 &&
           "block pointer not aligned to a multiple of 16 bytes");
    assert(nbytes % sizeof(struct disk_node) == 0 &&
           "block size not a multiple of sizeof(struct disk_node)");
    n = nbytes / sizeof(struct disk_node);
    if (n > 1) {
        disk_node_qsort((struct disk_node *) data, 0, n - 1);
    }
}


/* ---- In-memory node representation ---- */

/* Status of an in-memory tree node. */
enum node_status {
    NODE_INIT = 0,  /* node is minty fresh */
    NODE_EMITTED,   /* node has been processed by emit_node() */
    NODE_LAID_OUT,  /* node has been processed by layout_node() */
};

/* In-memory representation of a tree node.  Note that block size/depth
   is packed into a single 32 bit integer; as a result, nodes occupy
   exactly 256 bytes and are nicely cache aligned.
*/
struct mem_node {
    struct node_id id;        /* Hierarchical ID for node */
    int64_t htmid;            /* HTM ID of node */
    uint64_t index;           /* File offset of first point in node */
    uint64_t count;           /* Number of points in node */
    enum node_status status;
    uint32_t blockinfo[NLOD]; /* Clark & Munro: block depth (8 MSBs) and
                                 block size (24 LSBs) for each LOD. */
    struct mem_node *child[4];
} HTM_ALIGNED(16);


/* get/set block size/depth from 32 bit blockinfo */

/* Extracts the block size (low 24 bits). */
HTM_INLINE uint32_t get_block_size(uint32_t blockinfo) {
    return blockinfo & 0xffffff;
}
/* Extracts the block depth (high 8 bits). */
HTM_INLINE uint8_t get_block_depth(uint32_t blockinfo) {
    return blockinfo >> 24;
}
/* Packs size and depth into a single 32 bit value; size must fit in
   24 bits (larger values are silently truncated by the mask). */
HTM_INLINE uint32_t make_block_info(uint32_t size, uint8_t depth) {
    return (size & 0xffffff) | (((uint32_t) depth) << 24);
}


/* ---- Tree generation ---- */

/* Tree generation context. */
struct tree_gen_context {
#if FAST_ALLOC
    struct arena ar;        /* node memory allocator */
#endif
    size_t nnodes;          /* number of nodes in the tree */
    uint64_t leafthresh;    /* maximum # of points per leaf */
    uint64_t poidx;         /* next post-order tree traversal index */
    uint64_t blockid[NLOD]; /* index of next block ID to assign for each LOD;
                               IDs start at 1, so 0 means "unassigned" */
    struct blk_writer *wr;  /* node writer */
};

/* Initializes a tree generation context: zeroes all counters, sets up
   the node arena (when FAST_ALLOC) and opens a block writer on file
   with block size blksz. */
static void tree_gen_context_init(struct tree_gen_context * const ctx,
                                  const uint64_t leafthresh,
                                  const char * const file,
                                  const size_t blksz)
{
    int i;
    ctx->nnodes = 0;
    ctx->leafthresh = leafthresh;
    ctx->poidx = 0;
    for (i = 0; i < NLOD; ++i) {
        ctx->blockid[i] = 0;
    }
#if FAST_ALLOC
    arena_init(&ctx->ar, sizeof(struct mem_node));
#endif
    ctx->wr = blk_writer_init(file, blksz);
}

/* Tears down a tree generation context; closing the writer sorts and
   flushes any buffered disk nodes via disk_node_sort. */
static void tree_gen_context_destroy(struct tree_gen_context * const ctx)
{
#if FAST_ALLOC
    arena_destroy(&ctx->ar);
#endif
    blk_writer_close(ctx->wr, &disk_node_sort);
    ctx->wr = NULL;
}


/* Assigns a block ID at the specified level-of-detail to all nodes in
   the sub-tree rooted at n that do not already have a block ID at that
   level-of-detail.  When assigning a block ID to a node, check whether all
   block IDs are now valid and if so write the node out to disk.
   Children of nodes that are written are destroyed.
*/
static void assign_block(struct tree_gen_context * const ctx,
                         struct mem_node * const n,
                         const uint64_t blockid,
                         const int lod)
{
    int i;
    /* block IDs start at 1, so 0 means "not yet assigned" */
    if (n->id.block[lod] != 0) {
        return;
    }
    /* visit children */
    for (i = 0; i < 4; ++i) {
        if (n->child[i] != NULL) {
            assign_block(ctx, n->child[i], blockid, lod);
        }
    }
    n->id.block[lod] = blockid;
    /* node can only be written once it has a block ID at every LOD */
    for (i = 0; i < NLOD; ++i) {
        if (n->id.block[i] == 0) {
            return;
        }
    }
    /* write node to disk */
    {
        struct disk_node d;
        d.id = n->id;
        d.count = n->count;
        d.index = n->index;
        for (i = 0; i < 4; ++i) {
            struct mem_node *tmp = n->child[i];
            if (tmp != NULL) {
                /* copy id of child to disk node, throw away child */
                d.child[i] = tmp->id;
#if FAST_ALLOC
                arena_free(&ctx->ar, tmp);
#else
                free(tmp);
#endif
                n->child[i] = NULL;
            } else {
                /* zeroed node_id marks an empty child slot */
                memset(&d.child[i], 0, sizeof(struct node_id));
            }
        }
        blk_writer_append(ctx->wr, &d, sizeof(struct disk_node), &disk_node_sort);
        ++ctx->nnodes;
    }
}


/* Estimates the compressed on-disk size of a tree node.
*/
static uint32_t estimate_node_size(const struct mem_node * const node,
                                   const uint32_t nchild)
{
    uint32_t sz = htm_varint_len(node->index) + htm_varint_len(node->count);
    if (nchild > 0) {
        /* There is no way to compute size of a child offset accurately
           without knowing the final node layout, so use a guess of 4 bytes
           per non-empty child.  The offset for an empty child (0) will
           will occupy exactly 1 byte, regardless of layout.
           Hence: 4*nchild + 1*(4 - nchild) == nchild*3 + 4. */
        sz += nchild*3 + 4;
    }
    return sz;
}

/* Per-child layout bookkeeping used by layout_node()/finish_root(). */
struct child_info {
    struct mem_node *node;
    uint32_t size;   /* block size of the child at the current LOD */
    uint8_t depth;   /* block depth of the child at the current LOD */
    int8_t idx;      /* child slot index in the parent */
};

/* Insertion sort of (at most 8) child_info structs; they are sorted
   by block depth and size.
   (As with disk_node_isort, the implementation is a selection sort.)
*/
static void child_info_isort(struct child_info * const c, const int n)
{
    if (n < 2) {
        return;
    } else {
        int i, j, k;
        for (i = 0; i < n; ++i) {
            k = i;
            for (j = i + 1; j < n; ++j) {
                if (c[j].depth < c[k].depth ||
                    (c[j].depth == c[k].depth && c[j].size < c[k].size)) {
                    k = j;
                }
            }
            if (k != i) {
                struct child_info tmp = c[k];
                c[k] = c[i];
                c[i] = tmp;
            }
        }
    }
}

/* Recursively assigns block sizes/depths (Clark & Munro layout) to the
   sub-tree rooted at node, for every level-of-detail, assigning block
   IDs (and thereby flushing nodes to disk) whenever a block fills up. */
static void layout_node(struct mem_node * const node,
                        struct tree_gen_context * const ctx)
{
    struct child_info cinfo[4];
    int c, nchild;

    if (node->status > NODE_EMITTED) {
        return;
    }

    /* visit children */
    for (c = 0, nchild = 0; c < 4; ++c) {
        if (node->child[c] != NULL) {
            layout_node(node->child[c], ctx);
            cinfo[nchild].node = node->child[c];
            cinfo[nchild].idx = c;
            ++nchild;
        }
    }

    /* update status and assign post-order index.
    */
    node->status = NODE_LAID_OUT;
    /* NOTE(review): block[5] is presumably block[NLOD] (NLOD == 5?),
       i.e. the slot holding the unique post-order index that the
       compression phase uses as a hash code -- confirm against the
       definition of struct node_id. */
    node->id.block[5] = ++ctx->poidx;

    /* Clark & Munro at every level-of-detail */
    if (nchild == 0) {
        /* leaf */
        int lod;
        const uint32_t nodesz = estimate_node_size(node, nchild);
        const uint32_t info = make_block_info(nodesz, 1u);
        for (lod = 0; lod < NLOD; ++lod) {
            node->blockinfo[lod] = info;
            if (nodesz > layout_size[lod]) {
                /* leaf alone exceeds the block size: give it its own block */
                uint64_t blockid = ++ctx->blockid[lod];
                assign_block(ctx, node, blockid, lod);
            }
        }
    } else {
        /* internal node */
        int lod;
        const uint32_t nodesz = estimate_node_size(node, nchild);
        for (lod = 0; lod < NLOD; ++lod) {
            uint64_t blockid;
            uint32_t totsz = nodesz;
            int close = 0, endclose = nchild;

            for (c = 0; c < nchild; ++c) {
                uint32_t s;
                struct mem_node *tmp = cinfo[c].node;
                s = get_block_size(tmp->blockinfo[lod]);
                cinfo[c].size = s;
                cinfo[c].depth = get_block_depth(tmp->blockinfo[lod]);
                totsz += s;
            }
            child_info_isort(cinfo, nchild);

            if (cinfo[0].depth == cinfo[nchild - 1].depth) {
                /* all children have the same block depth */
                if (totsz <= layout_size[lod]) {
                    /* children and parent all fit in one block; merge them */
                    node->blockinfo[lod] = make_block_info(totsz, cinfo[0].depth);
                    continue;
                } else {
                    /* cannot fit family into one block: scan children
                       from smallest to largest, placing as many as possible
                       in the same block as the parent.
                    */
                    totsz = nodesz;
                    for (close = 0; close < nchild - 1; ++close) {
                        if (totsz + cinfo[close].size > layout_size[lod]) {
                            break;
                        }
                        totsz += cinfo[close].size;
                    }
                    /* increase block depth of parent by 1 */
                    node->blockinfo[lod] = make_block_info(totsz, cinfo[0].depth + 1);
                }
            } else {
                /* nchild > 1, not all children have the same block depth:
                   pull the run of largest-depth children into the
                   parent's block if they fit. */
                totsz = nodesz;
                for (endclose = nchild - 1; endclose > 0; --endclose) {
                    totsz += cinfo[endclose].size;
                    if (cinfo[endclose - 1].depth != cinfo[nchild - 1].depth) {
                        break;
                    }
                }
                if (totsz < layout_size[lod]) {
                    /* merge parent and largest-depth children */
                    node->blockinfo[lod] = make_block_info(totsz, cinfo[nchild - 1].depth);
                } else {
                    /* fresh block for parent, increase block depth by 1 */
                    node->blockinfo[lod] = make_block_info(nodesz, cinfo[nchild - 1].depth + 1);
                    endclose = nchild;
                }
            }
            /* scan remaining children from smallest to largest, merging
               runs of children into a single block where possible. */
            totsz = cinfo[close].size;
            for (c = close + 1; c < endclose; ++c) {
                if (totsz + cinfo[c].size > layout_size[lod]) {
                    blockid = ++ctx->blockid[lod];
                    for (; close < c; ++close) {
                        assign_block(ctx, cinfo[close].node, blockid, lod);
                    }
                    totsz = cinfo[c].size;
                } else {
                    totsz += cinfo[c].size;
                }
            }
            blockid = ++ctx->blockid[lod];
            for (; close < endclose; ++close) {
                assign_block(ctx, cinfo[close].node, blockid, lod);
            }
        }
    }
}

/* Called on a node when all points belonging to it have been accounted for.
01708 */ 01709 static void emit_node(struct mem_node * const node, 01710 struct tree_gen_context * const ctx) 01711 { 01712 int c; 01713 if (node->status > NODE_INIT) { 01714 return; 01715 } 01716 /* visit children */ 01717 for (c = 0; c < 4; ++c) { 01718 if (node->child[c] != NULL) { 01719 emit_node(node->child[c], ctx); 01720 } 01721 } 01722 if (node->count < ctx->leafthresh) { 01723 /* if the node point count is too small, 01724 make it a leaf by deleting all children. */ 01725 for (c = 0; c < 4; ++c) { 01726 if (node->child[c] != NULL) { 01727 #if FAST_ALLOC 01728 arena_free(&ctx->ar, node->child[c]); 01729 #else 01730 free(node->child[c]); 01731 #endif 01732 node->child[c] = NULL; 01733 } 01734 } 01735 node->status = NODE_EMITTED; 01736 } else { 01737 /* otherwise, layout the subtree rooted at node */ 01738 layout_node(node, ctx); 01739 } 01740 } 01741 01742 /* Adds a new node to root (one of S0,S1,S2,S3,N0,N1,N2,N3). 01743 */ 01744 static void add_node(struct mem_node * const root, 01745 struct tree_gen_context * const ctx, 01746 int64_t htmid, 01747 int64_t count, 01748 int64_t index) 01749 { 01750 struct mem_node *node; 01751 int lvl = 0; 01752 01753 for (lvl = 0, node = root; lvl < 20; ++lvl) { 01754 /* keep subdividing */ 01755 int i, c; 01756 node->count += count; 01757 c = (htmid >> 2*(19 - lvl)) & 3; 01758 for (i = 0; i < c; ++i) { 01759 struct mem_node *tmp = node->child[i]; 01760 if (tmp != NULL && tmp->status == NODE_INIT) { 01761 emit_node(node->child[i], ctx); 01762 } 01763 } 01764 index -= node->index; /* relativize index */ 01765 if (node->child[c] != NULL) { 01766 node = node->child[c]; 01767 } else { 01768 /* create child node */ 01769 #if FAST_ALLOC 01770 struct mem_node *child = (struct mem_node *) arena_alloc(&ctx->ar); 01771 #else 01772 struct mem_node *child = 01773 (struct mem_node *) malloc(sizeof(struct mem_node)); 01774 if (child == NULL) { 01775 err("malloc() failed"); 01776 } 01777 #endif 01778 memset(child, 0, sizeof(struct 
mem_node)); 01779 child->index = index; 01780 child->htmid = node->htmid*4 + c; 01781 node->child[c] = child; 01782 node = child; 01783 } 01784 } 01785 assert(node->htmid == htmid); 01786 node->count = count; 01787 } 01788 01789 01790 /* Container for the 8 level-0 HTM tree nodes. 01791 */ 01792 struct tree_root { 01793 uint64_t count; /* Total number of points in tree */ 01794 struct mem_node *child[8]; 01795 struct node_id childid[8]; 01796 }; 01797 01798 01799 /* Assigns block IDs to the HTM root nodes. 01800 */ 01801 static void finish_root(struct tree_root * const super, 01802 struct tree_gen_context * const ctx) 01803 { 01804 struct child_info cinfo[8]; 01805 int c, close, nchild, lod; 01806 01807 for (c = 0, nchild = 0; c < 8; ++c) { 01808 struct mem_node *tmp = super->child[c]; 01809 if (tmp != NULL) { 01810 super->count += tmp->count; 01811 cinfo[nchild].node = tmp; 01812 cinfo[nchild].idx = c; 01813 ++nchild; 01814 } 01815 } 01816 /* Clark & Munro for each level of detail */ 01817 for (lod = 0; lod < NLOD; ++lod) { 01818 uint64_t blockid; 01819 uint32_t totsz; 01820 for (c = 0; c < nchild; ++c) { 01821 struct mem_node *tmp = cinfo[c].node; 01822 cinfo[c].size = get_block_size(tmp->blockinfo[lod]); 01823 cinfo[c].depth = get_block_depth(tmp->blockinfo[lod]); 01824 } 01825 child_info_isort(cinfo, nchild); 01826 /* scan children from smallest to largest, merging 01827 as many as possible into blocks. */ 01828 close = 0; 01829 totsz = cinfo[0].size; 01830 for (c = 1; c < nchild; ++c) { 01831 if (totsz + cinfo[c].size > layout_size[lod]) { 01832 blockid = ++ctx->blockid[lod]; 01833 for (; close < c; ++close) { 01834 assign_block(ctx, cinfo[close].node, blockid, lod); 01835 } 01836 totsz = cinfo[c].size; 01837 } 01838 } 01839 blockid = ++ctx->blockid[lod]; 01840 for (; close < nchild; ++close) { 01841 assign_block(ctx, cinfo[close].node, blockid, lod); 01842 } 01843 } 01844 /* At this point, all nodes are guaranteed to have been written to disk. 
01845 Copy HTM root node IDs to the super root and then throw them away. */ 01846 for (c = 0; c < nchild; ++c) { 01847 super->childid[cinfo[c].idx] = cinfo[c].node->id; 01848 #if FAST_ALLOC 01849 arena_free(&ctx->ar, cinfo[c].node); 01850 #else 01851 free(cinfo[c].node); 01852 #endif 01853 super->child[cinfo[c].idx] = NULL; 01854 } 01855 } 01856 01857 01858 /* Tree generation driver function. 01859 */ 01860 static size_t tree_gen(const char * const datafile, 01861 const char * const treefile, 01862 const struct mem_params * const mem, 01863 struct tree_root * const super, 01864 const uint64_t leafthresh, 01865 const size_t npoints) 01866 { 01867 struct tree_gen_context ctx; 01868 const struct tree_entry *data; 01869 void *behind; 01870 int64_t htmid; 01871 uint64_t index, count; 01872 size_t i, nseg; 01873 int r, fd; 01874 const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE); 01875 size_t mapsz = npoints * sizeof(struct tree_entry); 01876 double t; 01877 01878 if (npoints == 0) { 01879 err("no input points"); 01880 } 01881 msg("Generating block sorted tree node file %s from %s\n", 01882 treefile, datafile); 01883 t = now(); 01884 fd = open(datafile, O_RDONLY); 01885 if (fd == -1) { 01886 err("failed to open file %s for reading", datafile); 01887 } 01888 if (mapsz % pagesz != 0) { 01889 mapsz += pagesz - mapsz % pagesz; 01890 } 01891 behind = mmap(NULL, mapsz, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0); 01892 if (behind == MAP_FAILED) { 01893 err("mmap() file %s for reading", datafile); 01894 } 01895 if (madvise(behind, mapsz, MADV_SEQUENTIAL) != 0) { 01896 err("madvise() failed on mmap for file %s", datafile); 01897 } 01898 data = (const struct tree_entry *) behind; 01899 behind = ((unsigned char *) behind) + mem->ioblksz; 01900 tree_gen_context_init(&ctx, leafthresh, treefile, mem->sortsz); 01901 memset(super, 0, sizeof(struct tree_root)); 01902 01903 /* walk over tree entries, adding tree nodes. 
*/ 01904 if (data[0].htmid == 0) { 01905 err("invalid HTM ID"); 01906 } 01907 htmid = 0; 01908 index = 0; 01909 count = 0; 01910 r = -1; 01911 for (i = 0; i < npoints; ++i) { 01912 if ((void *) &data[i] > behind) { 01913 void *ptr = ((unsigned char *) behind) - mem->ioblksz; 01914 if (madvise(ptr, mem->ioblksz, MADV_DONTNEED) != 0) { 01915 err("madvise() failed"); 01916 } 01917 behind = ((unsigned char *) behind) + mem->ioblksz; 01918 } 01919 if (data[i].htmid == htmid) { 01920 /* increase point count for the current htmid */ 01921 ++count; 01922 } else { 01923 int r2; 01924 assert(data[i].htmid > htmid); 01925 if (r >= 0) { 01926 /* add previous node if there is one */ 01927 add_node(super->child[r], &ctx, htmid, count, (uint64_t) index); 01928 } 01929 /* reset index, count, and htmid */ 01930 count = 1; 01931 index = (uint64_t) i; 01932 htmid = data[i].htmid; 01933 r2 = (int) (htmid >> 40) - 8; 01934 if (r2 < 0 || r2 > 7) { 01935 err("invalid HTM ID"); 01936 } 01937 if (r != r2) { 01938 /* need a new HTM root node */ 01939 if (r >= 0) { 01940 /* emit and layout the previous root if there is one */ 01941 emit_node(super->child[r], &ctx); 01942 layout_node(super->child[r], &ctx); 01943 } 01944 r = r2; 01945 /* create new root */ 01946 #if FAST_ALLOC 01947 super->child[r] = (struct mem_node *) arena_alloc(&ctx.ar); 01948 #else 01949 super->child[r] = 01950 (struct mem_node *) malloc(sizeof(struct mem_node)); 01951 if (super->child[r] == NULL) { 01952 err("malloc() failed"); 01953 } 01954 #endif 01955 memset(super->child[r], 0, sizeof(struct mem_node)); 01956 super->child[r]->htmid = r + 8; 01957 super->child[r]->index = index; 01958 } 01959 } 01960 } 01961 /* add last node, emit and layout last root */ 01962 add_node(super->child[r], &ctx, htmid, count, (uint64_t) index); 01963 emit_node(super->child[r], &ctx); 01964 layout_node(super->child[r], &ctx); 01965 01966 /* assign block IDs to the HTM roots */ 01967 finish_root(super, &ctx); 01968 if (super->count != 
npoints) { 01969 err("bug in tree generation phase - points not all accounted for"); 01970 } 01971 /* cleanup */ 01972 i = ctx.nnodes; 01973 nseg = ctx.ar.nseg; 01974 tree_gen_context_destroy(&ctx); 01975 if (munmap((void *) data, mapsz) != 0) { 01976 err("munmap() failed"); 01977 } 01978 if (close(fd) != 0) { 01979 err("close() failed"); 01980 } 01981 msg("\t%.3f sec total (%llu tree nodes, %llu MiB memory)\n\n", 01982 now() - t, (unsigned long long) i, 01983 (unsigned long long) (nseg*ARENA_SEGSZ)/(1024*1024)); 01984 return i; 01985 } 01986 01987 01988 /* ================================================================ */ 01989 /* Phase 3: Tree compression */ 01990 /* ================================================================ */ 01991 01992 /* Mapping from a node ID to a relative file offset. 01993 */ 01994 struct id_off { 01995 struct node_id id; 01996 uint64_t off; 01997 struct id_off *next; 01998 } HTM_ALIGNED(16); 01999 02000 /* Simple hash table that maps node IDs to relative file offsets. The 02001 implementation uses a power-of-2 sized backing array and chains on 02002 collision. The hash-code of a node ID is taken to be its post-order 02003 index (which is unique). 
*/
struct hash_table {
    size_t n;              /* number of entries stored */
    size_t cap;            /* capacity of array; always a power of 2 */
    struct id_off **array; /* buckets: singly linked collision chains */
#if FAST_ALLOC
    struct arena ar;       /* allocator for id_off entries */
#endif
};


/* Initializes a hash table with a fixed starting capacity of 65536
   buckets (must be a power of 2 so that "cap - 1" works as a mask). */
static void hash_table_init(struct hash_table *ht)
{
    ht->cap = 65536; /* must be a power of 2 */
    ht->n = 0;
    ht->array = (struct id_off **) malloc(ht->cap * sizeof(struct id_off *));
    if (ht->array == NULL) {
        err("malloc() failed");
    }
    memset(ht->array, 0, ht->cap * sizeof(struct id_off *));
#if FAST_ALLOC
    arena_init(&ht->ar, sizeof(struct id_off));
#endif
}


/* Frees all entries and the bucket array; with FAST_ALLOC the arena is
   torn down wholesale instead of walking the chains. */
static void hash_table_destroy(struct hash_table *ht)
{
#if FAST_ALLOC
    arena_destroy(&ht->ar);
#else
    size_t i;
    struct id_off *ptr;
    for (i = 0; i < ht->cap; ++i) {
        ptr = ht->array[i];
        while (ptr != NULL) {
            struct id_off *next = ptr->next;
            free(ptr);
            ptr = next;
        }
    }
#endif
    free(ht->array);
    ht->array = NULL;
    ht->n = 0;
    ht->cap = 0;
}


/* Grows hash table capacity by a factor of two.
*/
static void hash_table_grow(struct hash_table *ht) {
    size_t i, cap = ht->cap;
    struct id_off **array = ht->array;

    ht->cap = 2*cap;
    ht->array = (struct id_off **) malloc(ht->cap * sizeof(struct id_off *));
    if (ht->array == NULL) {
        err("malloc() failed");
    }
    memset(ht->array, 0, ht->cap * sizeof(struct id_off *));

    /* add previous hash table entries; entries are relinked (not
       reallocated), rehashed with the doubled-capacity mask.  The hash
       code is the node's unique post-order index, id.block[NLOD]. */
    for (i = 0; i < cap; ++i) {
        struct id_off *e = array[i];
        while (e != NULL) {
            struct id_off *next = e->next;
            size_t j = (2*cap - 1) & (size_t) e->id.block[NLOD];
            e->next = ht->array[j];
            ht->array[j] = e;
            e = next;
        }
    }
}


/* Adds an id to offset mapping to the given hash table.
02081 */ 02082 static void hash_table_add(struct hash_table * const ht, 02083 const struct node_id * const id, 02084 uint64_t off) 02085 { 02086 struct id_off *e; 02087 size_t i; 02088 if ((ht->n*3)/4 > ht->cap) { 02089 hash_table_grow(ht); 02090 } 02091 i = (ht->cap - 1) & (size_t) id->block[NLOD]; 02092 #if FAST_ALLOC 02093 e = (struct id_off *) arena_alloc(&ht->ar); 02094 #else 02095 e = (struct id_off *) malloc(sizeof(struct id_off)); 02096 if (e == NULL) { 02097 err("malloc() failed"); 02098 } 02099 #endif 02100 e->id = *id; 02101 e->off = off; 02102 e->next = ht->array[i]; 02103 ht->array[i] = e; 02104 ++ht->n; 02105 } 02106 02107 02108 /* Returns the offset of the node with the given ID and removes 02109 the corresponding hash table entry. 02110 */ 02111 static uint64_t hash_table_get(struct hash_table * const ht, 02112 const struct node_id * const id) 02113 { 02114 struct id_off *e, *prev; 02115 const size_t i = (ht->cap - 1) & (size_t) id->block[NLOD]; 02116 prev = NULL; 02117 e = ht->array[i]; 02118 /* child must occur before parent */ 02119 while (1) { 02120 if (e == NULL) { 02121 err("tree generation bug: parent node written before child"); 02122 } 02123 if (node_id_eq(id, &e->id)) { 02124 uint64_t off = e->off; 02125 if (prev == NULL) { 02126 ht->array[i] = e->next; 02127 } else { 02128 prev->next = e->next; 02129 } 02130 #if FAST_ALLOC 02131 arena_free(&ht->ar, e); 02132 #else 02133 free(e); 02134 #endif 02135 --ht->n; 02136 return off; 02137 } 02138 prev = e; 02139 e = e->next; 02140 } 02141 /* never reached */ 02142 } 02143 02144 02145 /* Writes out a tree node (byte reversed). 
*/
static uint64_t compress_node(struct hash_table * const ht,
                              struct blk_writer * const wr,
                              const struct disk_node * const n,
                              const uint64_t filesz,
                              const uint64_t leafthresh)
{
    unsigned char buf[48]; /* scratch for one byte-reversed node */
    unsigned char *s = buf;
    uint64_t sz = filesz;  /* running output file size in bytes */
    unsigned int v;
    int c, leaf;

    /* write out child offsets (from child 3 to child 0) */
    for (c = 3, leaf = 1; c >= 0; --c) {
        if (node_empty(&n->child[c])) {
            /* a single 0 byte encodes an empty child */
            *s = 0;
            ++s;
            ++sz;
        } else {
            /* this is tricky - child 3 of n can be laid out immediately after
               n, yielding a child offset of 0.  But 0 also means "empty child",
               so instead encode the actual offset + 1. */
            v = htm_varint_rencode(s, sz + 1 - hash_table_get(ht, &n->child[c]));
            s += v;
            sz += v;
            leaf = 0;
        }
    }
    if (leaf != 0) {
        /* n is a leaf: don't store child offsets
           (back out the four 1-byte empty-child markers just written) */
        s -= 4;
        sz -= 4;
    } else if (n->count < leafthresh) {
        err("tree generation bug: internal node contains too few points");
    }
    /* write out relative index, then count */
    v = htm_varint_rencode(s, n->index);
    s += v;
    sz += v;
    v = htm_varint_rencode(s, n->count);
    s += v;
    sz += v;
    /* write out byte reversed node, add node id to hashtable
       (the recorded offset is the file size *after* this node; parents
       later convert it to a relative backward offset). */
    blk_writer_append(wr, buf, (size_t) (s - buf), NULL);
    hash_table_add(ht, &n->id, sz);
    return sz;
}


/* Writes out a tree header (byte reversed).
*/
static uint64_t write_tree_header(struct hash_table * const ht,
                                  struct blk_writer * const wr,
                                  const struct tree_root * const super,
                                  const uint64_t filesz,
                                  const uint64_t leafthresh)
{
    unsigned char buf[96]; /* scratch for the byte-reversed header */
    unsigned char *s;
    uint64_t sz;
    int r;
    unsigned int v;

    /* write offsets of N3, N2, N1, N0, S3, S2, S1, S0 */
    for (r = 7, s = buf, sz = filesz; r >= 0; --r) {
        if (node_empty(&super->childid[r])) {
            /* a single 0 byte encodes an empty root */
            *s = 0;
            ++s;
            ++sz;
        } else {
            /* N3 could be laid out immediately after super root,
               yielding a child offset of 0, which means "empty child".
               Therefore encode 1 + actual offset. */
            v = htm_varint_rencode(
                s, sz + 1 - hash_table_get(ht, &super->childid[r]));
            s += v;
            sz += v;
        }
    }
    /* write total number of points in tree */
    v = htm_varint_rencode(s, super->count);
    s += v;
    sz += v;
    /* write leaf threshold */
    v = htm_varint_rencode(s, leafthresh);
    s += v;
    sz += v;
    blk_writer_append(wr, buf, (size_t) (s - buf), NULL);
    /* every node added by compress_node must have been consumed by a
       parent (or by this header); anything left indicates a bug. */
    if (ht->n != 0) {
        err("tree compression bug: node id hash table non-empty");
    }
    return sz;
}


/* Tree compression driver function.
*/
static uint64_t tree_compress(const char * const treefile,
                              const char * const scratchfile,
                              const struct mem_params * const mem,
                              const struct tree_root * const super,
                              const size_t nnodes,
                              const uint64_t leafthresh)
{
    struct hash_table ht;
    struct blk_writer *wr;
    const struct disk_node *data;
    void *behind;
    double t;
    size_t i;
    uint64_t filesz;
    int fd;
    const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
    size_t mapsz = nnodes * sizeof(struct disk_node);

    if (nnodes == 0) {
        err("no input nodes");
    }
    t = now();
    msg("Generating reversed compressed tree node file %s from %s\n",
        scratchfile, treefile);
    fd = open(treefile, O_RDONLY);
    if (fd == -1) {
        err("failed to open file %s for reading", treefile);
    }
    /* round the mapping length up to a page multiple */
    if (mapsz % pagesz != 0) {
        mapsz += pagesz - mapsz % pagesz;
    }
    behind = mmap(NULL, mapsz, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
    if (behind == MAP_FAILED) {
        err("mmap() file %s for reading", treefile);
    }
    if (madvise(behind, mapsz, MADV_SEQUENTIAL) != 0) {
        err("madvise() failed on mmap for file %s", treefile);
    }
    data = (const struct disk_node *) behind;
    /* behind trails the read cursor by one IO block; consumed pages
       are released with MADV_DONTNEED as the scan advances. */
    behind = ((unsigned char *) behind) + mem->ioblksz;
    hash_table_init(&ht);
    wr = blk_writer_init(scratchfile, mem->ioblksz);

    /* write nodes */
    for (i = 0, filesz = 0; i < nnodes; ++i) {
        if (i > 0) {
            assert(disk_node_lt(&data[i - 1], &data[i]) &&
                   "tree node file not sorted");
        }
        if ((void *) &data[i] > behind) {
            void *ptr = ((unsigned char *) behind) - mem->ioblksz;
            if (madvise(ptr, mem->ioblksz, MADV_DONTNEED) != 0) {
                err("madvise() failed");
            }
            behind = ((unsigned char *) behind) + mem->ioblksz;
        }
        filesz = compress_node(&ht, wr, &data[i], filesz, leafthresh);
    }
    /* and tree header */
    filesz = write_tree_header(&ht, wr, super, filesz, leafthresh);
    /* cleanup */
    blk_writer_close(wr, NULL);
    hash_table_destroy(&ht);
    if (munmap((void *) data, mapsz) != 0) {
        err("munmap() failed");
    }
    if (close(fd) != 0) {
        err("close() failed");
    }
    msg("\t%.3f sec total\n\n", now() - t);
    return filesz;
}


/* Byte reverses an input file to an output file and removes the input file.
*/
static void reverse_file(const char * const infile,
                         const char * const outfile,
                         const struct mem_params * const mem,
                         const uint64_t filesz)
{
    const unsigned char *data;
    struct blk_writer *wr;
    double t;
    uint64_t i, j;
    int fd;
    const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
    size_t mapsz = filesz;

    if (filesz == 0) {
        err("cannot reverse an empty file");
    }
    msg("Reversing file %s to produce %s\n", infile, outfile);
    t = now();
    fd = open(infile, O_RDONLY);
    if (fd == -1) {
        err("failed to open file %s for reading", infile);
    }
    /* round the mapping length up to a page multiple */
    if (mapsz % pagesz != 0) {
        mapsz += pagesz - mapsz % pagesz;
    }
    data = (const unsigned char *) mmap(NULL, mapsz, PROT_READ,
                                        MAP_SHARED | MAP_NORESERVE, fd, 0);
    if ((void *) data == MAP_FAILED) {
        err("failed to mmap() file %s", infile);
    }
    wr = blk_writer_init(outfile, mem->ioblksz);
    /* j tracks the start of the last whole-or-partial IO block not yet
       released; when the backward scan crosses it, the block behind it
       is discarded with MADV_DONTNEED. */
    j = mem->ioblksz * (filesz / mem->ioblksz);
    if (j == filesz) {
        j -= mem->ioblksz;
    }
    /* copy bytes one at a time from the end towards the beginning */
    for (i = filesz - 1; i > 0; --i) {
        blk_writer_append(wr, &data[i], 1, NULL);
        if (i == j) {
            size_t sz = filesz - j;
            if (sz > mem->ioblksz) {
                sz = mem->ioblksz;
            }
            madvise((void *) &data[i], sz, MADV_DONTNEED);
            j -= mem->ioblksz;
        }
    }
    /* i is unsigned, so byte 0 is appended outside the loop */
    blk_writer_append(wr, data, 1, NULL);
    blk_writer_close(wr, NULL);
    if (munmap((void *) data, mapsz) != 0) {
        err("munmap() failed");
    }
    if (close(fd) != 0) {
        err("close() failed");
    }
    if (unlink(infile) != 0) {
        err("failed to delete file %s", infile);
    }
    msg("\t%.3f sec total\n\n", now() - t);
}


/* ================================================================ */
/*        Phase 4: Convert spherical coords to unit vectors         */
/* ================================================================ */

/* Rewrites datafile so that each entry carries a 3-vector instead of
   spherical coordinates: entries are streamed to scratchfile, which
   then replaces datafile via rename(). */
static void spherical_to_vec(const char * const datafile,
                             const char * const scratchfile,
                             const struct mem_params * const mem,
                             const size_t npoints)
{
    const struct tree_entry *data;
    struct blk_writer *wr;
    void *behind;
    double t;
    size_t i;
    int fd;
    const size_t pagesz = (size_t) sysconf(_SC_PAGESIZE);
    size_t mapsz = sizeof(struct tree_entry) * npoints;

    if (mapsz == 0) {
        err("Empty data file");
    }
    msg("Converting spherical coordinates in %s to unit vectors\n", datafile);
    t = now();
    fd = open(datafile, O_RDONLY);
    if (fd == -1) {
        err("failed to open file %s for reading", datafile);
    }
    /* round the mapping length up to a page multiple */
    if (mapsz % pagesz != 0) {
        mapsz += pagesz - mapsz % pagesz;
    }
    behind = mmap(NULL, mapsz, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
    if (behind == MAP_FAILED) {
        err("failed to mmap() file %s", datafile);
    }
    if (madvise(behind, mapsz, MADV_SEQUENTIAL) != 0) {
        err("madvise() failed");
    }
    data = (const struct tree_entry *) behind;
    /* behind trails the read cursor by one IO block */
    behind = ((unsigned char *) behind) + mem->ioblksz;
    wr = blk_writer_init(scratchfile, mem->ioblksz);
    for (i = 0; i < npoints; ++i) {
        struct htm_tree_entry e;
        if ((void *) &data[i] > behind) {
            void *ptr = ((unsigned char *) behind) - mem->ioblksz;
            if (madvise(ptr, mem->ioblksz, MADV_DONTNEED) != 0) {
err("madvise() failed"); 02427 } 02428 behind = ((unsigned char *) behind) + mem->ioblksz; 02429 } 02430 htm_sc_tov3(&e.v, &data[i].sc); 02431 e.rowid = data[i].rowid; 02432 blk_writer_append(wr, &e, sizeof(struct htm_tree_entry), NULL); 02433 } 02434 blk_writer_close(wr, NULL); 02435 if (munmap((void *) data, mapsz) != 0) { 02436 err("munmap() failed"); 02437 } 02438 if (close(fd) != 0) { 02439 err("close() failed"); 02440 } 02441 if (rename(scratchfile, datafile) != 0) { 02442 err("failed to rename file %s to %s", scratchfile, datafile); 02443 } 02444 msg("\t%.3f sec total\n\n", now() - t); 02445 } 02446 02447 02448 /* ================================================================ */ 02449 /* main() */ 02450 /* ================================================================ */ 02451 02452 static void usage(const char *prog) { 02453 printf("%s [options] <out_path> <in_file_1> [<in_file_2> ...]\n" 02454 "\n" 02455 " This utility takes character-separated-value files\n" 02456 "containing spherical coordinates, and outputs an HTM tree index\n" 02457 "useful for fast point-in-spherical-region counting. The first\n" 02458 "column of the input files must correspond to unique row IDs\n" 02459 "(64-bit integers), the second to right ascension/longitude angles\n" 02460 "(in degrees), and the third to declination/latitude angles (also\n" 02461 "in degrees). Additional columns are allowed but ignored.\n" 02462 "\n" 02463 " Typically, two output files named <out_path>.htm and\n" 02464 "<out_path>.dat are generated, though the first may optionally\n" 02465 "be omitted.\n" 02466 "\n" 02467 " The first is a tree index, and the second contains points\n" 02468 "sorted by level 20 HTM ID. If the inputs contain no more than a\n" 02469 "configurable number of points, the tree index is omitted. When is\n" 02470 "this useful? For small data-sets, scanning the point file and\n" 02471 "testing each point for spherical-region membership can be faster\n" 02472 "than searching a tree. 
This is because tree searching has some\n" 02473 "overhead, and for sufficiently small data sets, a single page\n" 02474 "on disk will contain a significant fraction of the points. In\n" 02475 "this case, using a tree to reduce the number of points tested\n" 02476 "against a region fails to reduce IO compared to a scan.\n" 02477 "\n" 02478 "== Options ====\n" 02479 "\n" 02480 "--help |-h : Prints usage information.\n" 02481 "--blk-size |-b <int> : IO block size in KiB. The default is\n" 02482 " 1024 KiB.\n" 02483 "--delim |-d <char> : The separator character to use when\n" 02484 " parsing input files. The default is '|'.\n" 02485 "--max-mem |-m <int> : Approximate memory usage limit in MiB.\n" 02486 " The default is 512 MiB. Note that this\n" 02487 " applies only to various external sorts,\n" 02488 " and that tree generation can use up to\n" 02489 " about 512MiB of memory regardless.\n" 02490 "--tree-min |-t <int> : If the total number of input points does\n" 02491 " not exceed the specified number, tree\n" 02492 " index generation is skipped. 
The default\n" 02493 " is 1024.\n" 02494 "--leaf-thresh |-l <int> : Minimum number of points in an internal\n" 02495 " tree node; defaults to 64.\n", 02496 prog); 02497 } 02498 02499 02500 int main(int argc, char **argv) 02501 { 02502 struct mem_params mem; 02503 char **infile; 02504 char *treefile, *datafile, *scratch; 02505 int nfile; 02506 size_t len, npoints, nnodes; 02507 size_t memsz = 512*1024*1024; 02508 size_t ioblksz = 1024*1024; 02509 size_t minpoints = 1024; 02510 uint64_t leafthresh = 64; 02511 char delim = '|'; 02512 02513 while (1) { 02514 static struct option long_options[] = { 02515 { "help", no_argument, 0, 'h' }, 02516 { "blk-size", required_argument, 0, 'b' }, 02517 { "delim", required_argument, 0, 'd' }, 02518 { "max-mem", required_argument, 0, 'm' }, 02519 { "tree-min", required_argument, 0, 't' }, 02520 { "leaf-thresh", required_argument, 0, 'l' }, 02521 { 0, 0, 0, 0 } 02522 }; 02523 unsigned long long v; 02524 char *endptr; 02525 int option_index = 0; 02526 int c = getopt_long(argc, argv, "hb:d:l:m:t:", 02527 long_options, &option_index); 02528 if (c == -1) { 02529 break; /* no more options */ 02530 } 02531 switch (c) { 02532 case 'h': 02533 usage(argv[0]); 02534 return EXIT_SUCCESS; 02535 case 'b': 02536 v = strtoull(optarg, &endptr, 0); 02537 if (endptr == optarg || errno != 0 || v < 1 || v > 1024*1024) { 02538 err("--blk-size invalid. Please specify an integer between " 02539 "1 and 1,048,576 (KiB)."); 02540 } 02541 ioblksz = (size_t) v*1024; 02542 break; 02543 case 'd': 02544 if (strlen(optarg) != 1) { 02545 err("--delim|-d argument must be a single character"); 02546 } 02547 delim = optarg[0]; 02548 if (delim == '\0' || 02549 strchr("\n.+-eE0123456789aAfFiInN", delim) != NULL) { 02550 err("--delim invalid: '%c'", delim); 02551 } 02552 break; 02553 case 'l': 02554 v = strtoull(optarg, &endptr, 0); 02555 if (endptr == optarg || errno != 0 || v < 1 || v > 1024*1024) { 02556 err("--leaf-max invalid. 
Please specify an integer between " 02557 "1 and 1,048,576."); 02558 } 02559 leafthresh = (uint64_t) v; 02560 break; 02561 case 'm': 02562 v = strtoull(optarg, &endptr, 0); 02563 if (endptr == optarg || errno != 0 || 02564 v < 1 || v > SIZE_MAX/2097152) { 02565 err("--max-mem invalid. Please specify an integer between " 02566 "1 and %llu (MiB).", 02567 (unsigned long long) SIZE_MAX/2097152); 02568 } 02569 memsz = (size_t) v*1024*1024; 02570 break; 02571 case 't': 02572 v = strtoull(optarg, &endptr, 0); 02573 if (endptr == optarg || errno != 0 || v > SIZE_MAX) { 02574 err("--tree-min invalid. Please specify a non-negative " 02575 "integer less than or equal to %llu", 02576 (unsigned long long) SIZE_MAX); 02577 } 02578 minpoints = (size_t) v; 02579 break; 02580 case '?': 02581 return EXIT_FAILURE; 02582 default: 02583 abort(); 02584 } 02585 } 02586 if (argc - optind < 2) { 02587 err("Output file and at least one input file must be specified"); 02588 } 02589 len = strlen(argv[optind]); 02590 treefile = (char *) malloc(len + 5); 02591 datafile = (char *) malloc(len + 5); 02592 scratch = (char *) malloc(len + 5); 02593 if (treefile == NULL || datafile == NULL || scratch == NULL) { 02594 err("malloc() failed"); 02595 } 02596 strcpy(treefile, argv[optind]); 02597 strcpy(datafile, argv[optind]); 02598 strcpy(scratch, argv[optind]); 02599 strcat(treefile, ".htm"); 02600 strcat(datafile, ".dat"); 02601 strcat(scratch, ".scr"); 02602 02603 ++optind; 02604 infile = &argv[optind]; 02605 nfile = argc - optind; 02606 mem_params_init(&mem, memsz, ioblksz); 02607 02608 printf("\n"); 02609 02610 /* Phase 1: produce sorted point file from ASCII inputs */ 02611 npoints = blk_sort_ascii(infile, nfile, datafile, delim, &mem); 02612 ext_sort(datafile, scratch, &mem, sizeof(struct tree_entry), npoints, 02613 &tree_entry_cmp); 02614 02615 if (npoints > minpoints) { 02616 struct tree_root super; 02617 uint64_t filesz; 02618 /* Phase 2: produce sorted tree nodes from data file */ 02619 
nnodes = tree_gen(datafile, treefile, &mem, &super, 02620 leafthresh, npoints); 02621 ext_sort(treefile, scratch, &mem, sizeof(struct disk_node), nnodes, 02622 &disk_node_cmp); 02623 /* Phase 3: compress tree file */ 02624 filesz = tree_compress(treefile, scratch, &mem, &super, 02625 nnodes, leafthresh); 02626 reverse_file(scratch, treefile, &mem, filesz); 02627 } 02628 /* Phase 4: convert spherical coords to unit vectors */ 02629 spherical_to_vec(datafile, scratch, &mem, npoints); 02630 free(scratch); 02631 free(datafile); 02632 free(treefile); 02633 return EXIT_SUCCESS; 02634 } 02635 02638 #ifdef __cplusplus 02639 } 02640 #endif