CTTL on
SourceForge |
Download
Latest |
Documentation
Index |
Library
News |
CVS
Repository |
Other
Links |
A utility that splits a large text file, such as this sample trace output, into smaller files: the input is read in chunks of about 100 MB, and each chunk is divided into N line-aligned parts.
// partition_file.cpp // utility to partition file // arguments: full_path\file N // note that full path is required; N is number of parts. // expect last part to be first-part-size <= last-part-size <= 2*first-part-size //#define NDEBUG // define before assert.h to stop assertions from being compiled //#define CTTL_TRACE_EVERYTHING //define to turn tracing on //#define CTTL_TRACE_RULES //define to turn light tracing on #include <iostream> #include "cttl/cttl.h" #include "utils/fileio.h" #include "utils/itos.h" using namespace cttl; size_t partition_file( input<>& inp_, const_edge<>& universe_, size_t part_start_, size_t parts_requested_, const std::string& base_directory_, const std::string& base_file_name_ ) { assert( inp_.length() ); assert( parts_requested_ < inp_.length() ); static size_t part_size = inp_.length() / parts_requested_; // part size determined only once assert( part_size ); // std::cout << "part size: " << part_size << std::endl; if ( part_size >= inp_.length() ) { string2file( base_directory_ + "part_" + itos( part_start_ ) + '_' + base_file_name_, universe_ ); std::cout << "\t wrote: " << ( "part_" + itos( part_start_++ ) + '_' + base_file_name_ ) << std::endl; } else { size_t begin_part = 0; size_t end_part = part_size; for ( ;; ) { universe_.first.offset( begin_part ); universe_.first.go_line_home(); universe_.second.offset( end_part ); begin_part = universe_.second.go_line_home(); // adjust if last part is small; add last part to the previous part: if ( inp_.length() < begin_part + part_size ) { end_part = universe_.second.go_eof(); assert( end_part == inp_.length() ); } string2file( base_directory_ + "part_" + itos( part_start_ ) + '_' + base_file_name_, universe_ ); std::cout << "\t wrote: " << ( "part_" + itos( part_start_++ ) + '_' + base_file_name_ ) << std::endl; if ( end_part == inp_.length() ) break; end_part = std::min( inp_.length(), begin_part + part_size ); } } inp_.text( "" ); return part_start_; } const size_t item_count = 
1024*1024; // read up to 1MB at a time const size_t max_line_size = item_count + 1024; // allow 16k for the line size char line_buffer[ max_line_size ]; // buffer for reading the data in const size_t item_size = 1; // read items of 1 byte in size const size_t max_inp_length = item_count * 100; // do not exceed 100MB when reading input int main(int argc, char* argv[]) { if ( argc != 3 ) { std::cout << "Usage: " << std::endl << '\t' << argv[ 0 ] << " full_path/file.txt N" << std::endl << "note that full path is required; N is number of parts." << std::endl << "expect last part to be N <= last-part-size <= 2*N" << std::endl ; return 0; } // step 1: get base file name and directory input<> inp( argv[ 1 ] ); assert( inp.length() ); const_edge<> universe( new_edge( inp ) ); universe.first.go_bof(); universe.second.go_eof(); if ( ( *quote( true, symbol( true ), symbol( '\\' ) | symbol( '/' ) ) ).match( universe ) == std::string::npos ) { std::cout << "*** FAILED TO PARSE FILE NAME ***" << std::endl; return 1; } std::string base_file_name( universe ); universe.second.offset( universe.first.offset() ); universe.first.go_bof(); std::string base_directory( universe ); std::cout << "\t: reading " << base_directory << base_file_name << std::endl; // step 2: partition file size_t parts_requested = atoi( argv[ 2 ] ); assert( parts_requested ); FILE *file_stream; inp.text( "" ); size_t part_start = 1; if( ( file_stream = fopen( ( base_directory + base_file_name ).c_str(), "r" ) ) != NULL ) { // while ( fgets( line_buffer, max_line_size, file_stream ) != NULL ) { size_t numread = 0; while ( ( numread = fread( line_buffer, item_size, item_count, file_stream ) ) ) { line_buffer[ numread ] = 0x00; inp.text() += line_buffer; if ( inp.length() > max_inp_length ) { // chunk of text that needs to be processed part_start = partition_file( inp, universe, part_start, parts_requested, base_directory, base_file_name ); std::cout << "\t: reading " << base_directory << base_file_name << 
std::endl; } } assert( feof( file_stream ) );//returns 0 if the current position is not end of file fclose( file_stream ); } if ( inp.length() ) { // if file size did not exceed 100MB part_start = partition_file( inp, universe, part_start, parts_requested, base_directory, base_file_name ); } return 0; }
Permission to copy, use, modify, sell and distribute this document is granted provided this copyright notice appears in all copies. This document is provided "as is" without express or implied warranty, and with no claim as to its suitability for any purpose.