package Clair::SyntheticCollection;

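=head1 NAME

Clair::SyntheticCollection - create, or open read-only, a collection of synthetic documents whose terms are drawn from a term distribution

=cut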

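=head1 new

Clair::SyntheticCollection
	- new (mode => string, name => string, base => directory, ...)

 "mode", "name" and "base" are always required.

 If mode is "read_only", an existing collection is opened and its size
 is read from <base>/stats/num_docs.txt.

 For any other mode a new collection is prepared, which additionally
 requires term_map, term_dist and size, plus EITHER
 (doclen_dist and doclen_map) OR doc_length, but not both.

 Some examples:
 my $a = Clair::SyntheticCollection->new (name => "name",
				    mode => "create",
				    base => $collection_dir,
				    term_map => \@ranks2terms,
				    term_dist => $term_dist,
				    doclen_dist => $doclen_dist,
				    doclen_map => \@doclen_map,
				    size => $number_of_docs);

 my $a = Clair::SyntheticCollection->new (name => "name",
				    mode => "create",
				    base => $collection_dir,
				    term_map => \@ranks2terms,
				    term_dist => $term_dist,
				    doc_length => 100,
				    size => $number_of_docs);

 my $a = Clair::SyntheticCollection->new (name => "name",
				    mode => "read_only",
				    base => $collection_dir);


 Needs:
 1. Base Dir
 2. Synthetic Collection Output Dir

 Sat Mar 12 19:37:19 EST 2005

=cut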

use strict;
use Carp;
#my $SYNTH_COL_BASE	= "$ENV{PERLTREE_HOME}/synth_collections";

# Construct a SyntheticCollection object.
#
# Arguments (flat key => value list; every pair is copied into the object):
#   mode  - "read_only" opens an existing collection; any other value
#           prepares a new one
#   name  - collection name, used to build the .cos and .files paths
#   base  - collection base directory
# Additionally, when mode is not "read_only":
#   term_map, term_dist, size                    - all required
#   (doclen_dist and doclen_map) OR doc_length   - exactly one of the two
#
# Returns the blessed object.  Croaks on missing or conflicting
# arguments, when the base directory cannot be created, or (read_only)
# when <base>/stats/num_docs.txt cannot be opened.
sub new {
  my $class = shift;
  my %params = @_;
  my $self = bless { %params }, $class;

  # Required in every mode: mode, name, base
  unless ((exists $params{mode}) && (exists $params{name})
	  && (exists $params{base})) {
    croak
      "SyntheticCollection ctor requires \"mode\", \"name\" and \"base\" arguments\n";
  }

  # Derived locations, all rooted at the caller-supplied base directory
  $self->{collection_base} = $params{base};
  $self->{docs_dir}        = "$self->{collection_base}/raw_docs";
  $self->{stats_dir}       = "$self->{collection_base}/stats";
  $self->{cosine_file}     = "$self->{collection_base}/$self->{name}.cos";
  $self->{file_list}       = "$self->{collection_base}/$self->{name}.files";

  # Read-only: the only extra state we need is the stored document count
  if ($params{mode} eq "read_only") {
    my $num_docs_file = "$self->{stats_dir}/num_docs.txt";
    open (my $num_docs_fh, '<', $num_docs_file) ||
      croak "Could not open file $num_docs_file: $!\n";
    my $size = <$num_docs_fh>;
    close ($num_docs_fh);

    # Strip the line terminator (if any) so size is a clean number;
    # an empty file still leaves size undefined, as before.
    $size =~ s/\s+\z// if defined $size;
    $self->{size} = $size;

    return $self;
  }

  # Creating a new collection: term_map, term_dist and size are mandatory
  unless ((exists $params{term_map}) &&
          (exists $params{term_dist}) &&
          (exists $params{size})) {
    croak
      "SyntheticCollection ctor requires the following args to create\n" .
      "         a new synthetic collection:\n" .
      "\tterm_map\t=> array reference mapping ranks to terms\n" .
      "\tterm_dist\t=> a RandomDistribution object\n" .
      "\tsize\t\t=> number of documents to generate\n\n";
  }

  # Document lengths come from exactly one source:
  #   (doclen_dist, doclen_map) || doc_length
  my $has_doclen_dist = (exists $params{doclen_dist}) &&
                        (exists $params{doclen_map});
  my $has_doc_length  = exists $params{doc_length};

  unless ($has_doclen_dist || $has_doc_length) {
    croak "Must specify (doclen_dist and doclen_map) OR doc_length\n";
  }
  if ($has_doclen_dist && $has_doc_length) {
    croak "Must specify (doclen_dist and doclen_map) OR doc_length\n" .
          "Not both.\n";
  }

  # All parameters are present; make sure the base directory exists.
  unless (-d $self->{collection_base}) {
    mkdir ($self->{collection_base}, 0775) ||
      croak "Could not create directory $self->{collection_base}: $!\n";
  }

  return $self;
}

# Turn a caller-supplied count into a whole number safe for "while ($n--)".
# undef counts as 0, fractions are truncated, negatives are rejected:
# a countdown that starts negative or fractional never reaches exactly 0
# and would loop forever.
sub _whole_count {
  my ($value, $label) = @_;

  return 0 unless defined $value;
  croak "Invalid $label: $value\n" if $value < 0;
  return int($value);
}

# Generate the synthetic documents for this collection.
#
# Writes one file per document to <docs_dir>/synth.<n> (n counts down from
# size-1 to 0), lists the file names in <file_list>, and stores the
# document count in <stats_dir>/num_docs.txt.
#
# Document length per file is taken from, in order of precedence:
#   mirror_doclen set -> doclen_map->[n]
#   doc_length true   -> that fixed length
#   otherwise         -> doclen_map->[ draw from doclen_dist ]
#
# Returns 1.  Croaks if the mode is "read_only", if the documents directory
# already exists, if size or a document length is negative, or on any
# directory/file creation or write failure.
sub create_documents {
  my $self = shift;

  # Only proceed if our mode is not "read_only"
  if ($self->{mode} eq "read_only") {
    croak "Cannot create documents: mode is \"read_only\"\n\n";
  }

  # Make sure there is not an existing synth collection with this name
  if (-d $self->{docs_dir}) {
    croak "Collection $self->{name} already exists\n";
  }

  # Validate the document count before anything is created on disk
  my $num_docs = _whole_count($self->{size}, "collection size");

  # Create our document directory
  mkdir ($self->{docs_dir}, 0775) ||
    croak "Could not create directory $self->{docs_dir}: $!\n";

  my $term_dist = $self->{term_dist};
  my $term_map = $self->{term_map};

  my $doclen_map = $self->{doclen_map};
  my ($uniform_doclen, $doclen_dist) =
     ($self->{doc_length}, $self->{doclen_dist});
  my $mirror_doclen = $self->{mirror_doclen};

  # Keep a list of filenames
  open (my $files_fh, '>', $self->{file_list}) ||
    croak "Could not create file $self->{file_list}: $!\n";

  # Create files by counting down from $num_docs - 1 to 0
  my $docs_itor = $num_docs;
  while ($docs_itor--) {
    # Get a document length for this doc
    my $raw_doclen;
    if ($mirror_doclen) {
      # Mirror the existing document length distribution
      $raw_doclen = $doclen_map->[$docs_itor];
    } else {
      # Otherwise uniform document length or from distribution
      # NOTE(review): term_map below is indexed with draw-1 but doclen_map
      # with the raw draw; confirm the intended index base of doclen_map.
      $raw_doclen = ($uniform_doclen ||
                     $doclen_map->[$doclen_dist->draw_rand_from_dist()]);
    }
    my $cur_doclen = _whole_count($raw_doclen, "document length");

    # Create the output file
    my $doc_path = "$self->{docs_dir}/synth.$docs_itor";
    open (my $out_fh, '>', $doc_path) ||
      croak "Could not create file $doc_path: $!\n";

    # Generate random terms based on term dist; each term is followed by
    # a single space and the document ends with a newline.
    while ($cur_doclen--) {
      my $r = $term_dist->draw_rand_from_dist();
      # presumably draws are 1-based ranks, hence the -1 (verify against
      # the distribution class)
      my $term = $term_map->[$r-1];
      print {$out_fh} $term;
      print {$out_fh} " ";
    }
    print {$out_fh} "\n";

    # Buffered write errors only surface at close, so check it
    close ($out_fh) ||
      croak "Could not write file $doc_path: $!\n";

    # Add this file to the list
    print {$files_fh} "synth.$docs_itor\n";
  }

  close ($files_fh) ||
    croak "Could not write file $self->{file_list}: $!\n";

  # Make sure stats dir exists
  unless (-d $self->{stats_dir}) {
    mkdir ($self->{stats_dir}, 0775) ||
      croak "Could not create directory $self->{stats_dir}: $!\n";
  }

  # Store the file count (no trailing newline, matching the existing format)
  my $num_docs_file = "$self->{stats_dir}/num_docs.txt";
  open (my $num_docs_fh, '>', $num_docs_file) ||
    croak "Could not create file $num_docs_file: $!\n";
  print {$num_docs_fh} $num_docs;
  close ($num_docs_fh) ||
    croak "Could not write file $num_docs_file: $!\n";

  return 1;
}

1;
