////////////////////////////////////////////////////////////////////////
//
// This file is part of Common Text Transformation Library.
// Copyright (C) 1997-2009 by Igor Kholodov.
//
// Common Text Transformation Library is free software: you can
// redistribute it and/or modify it under the terms of the
// GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Common Text Transformation Library is distributed in the hope
// that it will be useful, but WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Common Text Transformation Library.
// If not, see .
//
// mailto:cttl@users.sourceforge.net
// http://cttl.sourceforge.net/
// http://sourceforge.net/projects/cttl/
//
////////////////////////////////////////////////////////////////////////
// partition_file.cpp
// utility to partition file
// arguments: full_path\file N
// note that full path is required; N is number of parts.
// expect last part to be first-part-size <= last-part-size <= 2*first-part-size
//#define NDEBUG // define before assert.h to stop assertions from being compiled
//#define CTTL_TRACE_EVERYTHING //define to turn tracing on
//#define CTTL_TRACE_RULES //define to turn light tracing on
#include
#include "cttl/cttl.h"
#include "utils/fileio.h"
//#include "utils/itos.h"
#include "utils/string_format.h"
using namespace cttl;
size_t partition_file(
std::string& inp_,
const_edge<>& substr_,
size_t part_start_,
size_t parts_requested_,
const std::string& base_directory_,
const std::string& base_file_name_
)
{
assert( inp_.length() );
assert( parts_requested_ < inp_.length() );
static size_t part_size = inp_.length() / parts_requested_; // part size determined only once
assert( part_size );
// std::cout << "part size: " << part_size << std::endl;
if ( part_size >= inp_.length() ) {
string2file( base_directory_ + "part_" + type2string( part_start_ ) + '_' + base_file_name_, substr_.text() );
std::cout << "\t wrote: " << ( "part_" + type2string( part_start_++ ) + '_' + base_file_name_ ) << std::endl;
} else {
size_t begin_part = 0;
size_t end_part = part_size;
for ( ;; ) {
substr_.first.offset( begin_part );
substr_.first.go_line_home();
substr_.second.offset( end_part );
begin_part = substr_.second.go_line_home();
// adjust if last part is small; add last part to the previous part:
if ( inp_.length() < begin_part + part_size ) {
end_part = substr_.second.go_eof();
assert( end_part == inp_.length() );
}
string2file( base_directory_ + "part_" + type2string( part_start_ ) + '_' + base_file_name_, substr_.text() );
std::cout << "\t wrote: " << ( "part_" + type2string( part_start_++ ) + '_' + base_file_name_ ) << std::endl;
if ( end_part == inp_.length() )
break;
end_part = std::min( inp_.length(), begin_part + part_size );
}
}
inp_ = "";
return part_start_;
}
const size_t item_count = 1024*1024; // read up to 1MB at a time
const size_t max_line_size = item_count + 1024; // allow 16k for the line size
char line_buffer[ max_line_size ]; // buffer for reading the data in
const size_t item_size = 1; // read items of 1 byte in size
const size_t max_inp_length = item_count * 100; // do not exceed 100MB when reading input
int main(int argc, char* argv[])
{
if ( argc != 3 ) {
std::cout
<< "Usage: "
<< std::endl
<< '\t'
<< argv[ 0 ]
<< " full_path/file.txt N"
<< std::endl
<< "note that full path is required; N is number of parts."
<< std::endl
<< "expect last part to be N <= last-part-size <= 2*N"
<< std::endl
;
return 0;
}
// step 1: get base file name and directory
std::string inp = argv[ 1 ];
assert( inp.length() );
const_edge<> substring( inp );
substring.first.go_bof();
substring.second.go_eof();
if ( ( *quote( true, symbol( true ), symbol( '\\' ) | symbol( '/' ) ) ).match( substring ) == std::string::npos ) {
std::cout << "*** FAILED TO PARSE FILE NAME ***" << std::endl;
return 1;
}
std::string base_file_name( substring );
substring.second.offset( substring.first.offset() );
substring.first.go_bof();
std::string base_directory( substring );
std::cout << "\t: reading " << base_directory << base_file_name << std::endl;
// step 2: partition file
size_t parts_requested = atoi( argv[ 2 ] );
assert( parts_requested );
FILE *file_stream;
inp = "";
size_t part_start = 1;
if( ( file_stream = fopen( ( base_directory + base_file_name ).c_str(), "r" ) ) != NULL )
{
// while ( fgets( line_buffer, max_line_size, file_stream ) != NULL ) {
size_t numread = 0;
while ( ( numread = fread( line_buffer, item_size, item_count, file_stream ) ) ) {
line_buffer[ numread ] = 0x00;
inp += line_buffer;
if ( inp.length() > max_inp_length ) {
// chunk of text that needs to be processed
part_start = partition_file(
inp,
substring,
part_start,
parts_requested,
base_directory,
base_file_name
);
std::cout << "\t: reading " << base_directory << base_file_name << std::endl;
}
}
assert( feof( file_stream ) );//returns 0 if the current position is not end of file
fclose( file_stream );
}
if ( inp.length() ) {
// if file size did not exceed 100MB
part_start = partition_file(
inp,
substring,
part_start,
parts_requested,
base_directory,
base_file_name
);
}
return 0;
}