Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

partition_file.cpp


SourceForge.net Logo     CTTL on    
    SourceForge    
    Download    
    Latest    
    Documentation    
    Index    
    Library    
    News    
    CVS    
    Repository    
   Other    
   Links    

A utility that divides a large text file, such as this sample trace output, into smaller parts: the file is read in chunks of roughly 100MB, and each chunk is split into line-aligned part files.

// partition_file.cpp
// utility to partition file
// arguments: full_path\file N
// note that full path is required; N is number of parts.
// expect last part to be first-part-size <= last-part-size <= 2*first-part-size

//#define NDEBUG    // define before assert.h to stop assertions from being compiled 
//#define CTTL_TRACE_EVERYTHING //define to turn tracing on
//#define CTTL_TRACE_RULES  //define to turn light tracing on

#include <cstdio>
#include <iostream>
#include "cttl/cttl.h"
#include "utils/fileio.h"
#include "utils/itos.h"

using namespace cttl;

// Splits the text currently held by inp_ into parts and writes each part to
//     base_directory_ + "part_" + <number> + '_' + base_file_name_
//
// The part size is computed only once, on the first call, as
// inp_.length() / parts_requested_, and is reused by all later calls.
// Part boundaries are moved back to the beginning of a line, and a short
// remainder is appended to the preceding part instead of being written
// as a separate file.
//
// inp_              text chunk to partition; emptied before returning
// universe_         edge used as the window that is written to each file;
//                   its nodes are repositioned by this function
// part_start_       number given to the first part written by this call
// parts_requested_  number of parts; consulted on the first call only
// base_directory_   directory prefix of the output files
// base_file_name_   file name suffix of the output files
//
// Returns the number to be used for the next part.
size_t partition_file(
                   input<>& inp_,
                   const_edge<>& universe_,
                   size_t part_start_,
                   size_t parts_requested_,
                   const std::string& base_directory_,
                   const std::string& base_file_name_
                   )
{
    assert( inp_.length() );

    // Part size is determined only once. The parts_requested_ sanity checks
    // belong to that first computation only: a later, smaller chunk may
    // legitimately be shorter than parts_requested_ and is handled by the
    // single-part branch below.
    static size_t part_size = 0;
    if ( !part_size ) {
        assert( parts_requested_ );
        assert( parts_requested_ < inp_.length() );
        part_size = inp_.length() / parts_requested_;
    }
    assert( part_size );
//  std::cout << "part size: " << part_size << std::endl;

    if ( part_size >= inp_.length() ) {
        // The whole chunk fits into one part. Span the edge over the entire
        // text explicitly; its nodes may still hold positions left over from
        // earlier use.
        universe_.first.go_bof();
        universe_.second.go_eof();

        const std::string part_name = "part_" + itos( part_start_ ) + '_' + base_file_name_;
        string2file( base_directory_ + part_name, universe_ );
        std::cout << "\t wrote: " << part_name << std::endl;
        ++part_start_;

    } else {

        size_t begin_part = 0;
        size_t end_part = part_size;

        // NOTE(review): if a single line is longer than part_size,
        // go_line_home() below may return the offset where the current part
        // starts, in which case the loop would not advance -- confirm
        // against CTTL node semantics.
        for ( ;; ) {
            // lower boundary: beginning of the line at begin_part
            universe_.first.offset( begin_part );
            universe_.first.go_line_home();

            // upper boundary: beginning of the line containing end_part;
            // the same offset is where the next part begins
            universe_.second.offset( end_part );
            begin_part = universe_.second.go_line_home();

            // adjust if last part is small; add last part to the previous part:
            if ( inp_.length() < begin_part + part_size ) {
                end_part = universe_.second.go_eof();
                assert( end_part == inp_.length() );
            }

            const std::string part_name = "part_" + itos( part_start_ ) + '_' + base_file_name_;
            string2file( base_directory_ + part_name, universe_ );
            std::cout << "\t wrote: " << part_name << std::endl;
            ++part_start_;

            if ( end_part == inp_.length() )
                break;

            end_part = std::min( inp_.length(), begin_part + part_size );
        }
    }

    // the chunk has been consumed; leave the input empty for the caller
    inp_.text( "" );
    return part_start_;
}

// Buffer geometry for reading the input file in main().
const size_t item_count = 1024*1024;            // read up to 1MB at a time
const size_t max_line_size = item_count + 1024; // buffer capacity: one full read plus 1KB of slack (room for the terminating zero)
char line_buffer[ max_line_size ];              // buffer for reading the data in
const size_t item_size = 1;                     // read items of 1 byte in size
const size_t max_inp_length = item_count * 100; // once accumulated input exceeds 100MB, it is partitioned and emptied

// Program entry point.
//
// Arguments: full_path/file N
//     argv[ 1 ]  path of the file to partition
//     argv[ 2 ]  N, the number of parts
//
// The file is read into memory in blocks of up to item_count bytes. Each
// time the accumulated text exceeds max_inp_length it is handed to
// partition_file(), which writes the part files and empties the input;
// whatever remains at the end of the file is partitioned by a final call.
//
// Returns 0 on success (and after printing the usage text), 1 if the file
// name cannot be parsed, N is not a positive number, or the file cannot be
// opened or read.
int main(int argc, char* argv[])
{
    if ( argc != 3 ) {
        std::cout
            << "Usage: "
            << std::endl
            << '\t'
            << argv[ 0 ]
            << " full_path/file.txt N"
            << std::endl
            << "note that full path is required; N is number of parts."
            << std::endl
            << "expect last part to be first-part-size <= last-part-size <= 2*first-part-size"
            << std::endl
            ;
        return 0;
    }

    // Validate N before doing any work: zero would cause a division by zero
    // when the part size is computed, and a negative value would wrap around
    // to a huge size_t.
    const int parts_argument = atoi( argv[ 2 ] );
    if ( parts_argument <= 0 ) {
        std::cout << "*** INVALID NUMBER OF PARTS: " << argv[ 2 ] << " ***" << std::endl;
        return 1;
    }
    const size_t parts_requested = parts_argument;

    // step 1: get base file name and directory
    input<> inp( argv[ 1 ] );
    assert( inp.length() );
    const_edge<> universe( new_edge( inp ) );
    universe.first.go_bof();
    universe.second.go_eof();

    // Presumably consumes every leading "...\" or ".../" path component, so
    // that the edge is left covering the bare file name -- TODO confirm
    // against the CTTL quote() documentation.
    if ( ( *quote( true, symbol( true ), symbol( '\\' ) | symbol( '/' ) ) ).match( universe ) == std::string::npos ) {
        std::cout << "*** FAILED TO PARSE FILE NAME ***" << std::endl;
        return 1;
    }

    std::string base_file_name( universe );
    // everything in front of the file name is taken as the directory
    universe.second.offset( universe.first.offset() );
    universe.first.go_bof();
    std::string base_directory( universe );
    std::cout << "\t: reading " << base_directory << base_file_name << std::endl;

    // step 2: partition file

    FILE *file_stream = fopen( ( base_directory + base_file_name ).c_str(), "r" );
    if ( file_stream == NULL ) {
        std::cout << "*** FAILED TO OPEN FILE ***" << std::endl;
        return 1;
    }

    // the input held the path so far; from here on it accumulates file content
    inp.text( "" );
    size_t part_start = 1;

    size_t numread = 0;
    while ( ( numread = fread( line_buffer, item_size, item_count, file_stream ) ) ) {
        // numread <= item_count < max_line_size, so the terminator fits
        line_buffer[ numread ] = 0x00;
        inp.text() += line_buffer;
        if ( inp.length() > max_inp_length ) {
            // chunk of text that needs to be processed
            part_start = partition_file(
               inp,
               universe,
               part_start,
               parts_requested,
               base_directory,
               base_file_name
               );

            std::cout << "\t: reading " << base_directory << base_file_name << std::endl;
        }
    }

    // fread() returns 0 both at end of file and on error; tell them apart
    const bool read_failed = ( ferror( file_stream ) != 0 );
    fclose( file_stream );
    if ( read_failed ) {
        std::cout << "*** FAILED TO READ FILE ***" << std::endl;
        return 1;
    }

    if ( inp.length() ) {
        // text left over after the last full chunk, or the entire file
        // if its size did not exceed 100MB
        part_start = partition_file(
            inp,
            universe,
            part_start,
            parts_requested,
            base_directory,
            base_file_name
            );
    }

    return 0;
}



Copyright © 1997-2006 Igor Kholodov mailto:cttl@users.sourceforge.net.

Permission to copy, use, modify, sell and distribute this document is granted provided this copyright notice appears in all copies. This document is provided "as is" without express or implied warranty, and with no claim as to its suitability for any purpose.


Generated on Thu Nov 2 17:44:56 2006 for Common Text Transformation Library by  doxygen 1.3.9.1