svndigest - svndigest

plugins/base1/se.lu.onk.cgh_dataDumper/trunk/cgh_dataDumper.pl

: Code
: Comments
: Other

Rev	Date	Author	Line
720	11 Jun 08	jari	1	#!/usr/bin/perl
720	11 Jun 08	jari	2
720	11 Jun 08	jari	3	# $Id$
720	11 Jun 08	jari	4
720	11 Jun 08	jari	5	# Document name: cgh_dataDumper.pl
720	11 Jun 08	jari	6	# cgh_dataDumper.pl version 2.0
720	11 Jun 08	jari	7	#
720	11 Jun 08	jari	8	#
720	11 Jun 08	jari	9	# Copyright (C) 2004 Johan Staaf
720	11 Jun 08	jari	10
720	11 Jun 08	jari	11	# cgh_dataDumper.pl is free software; you can redistribute it and/or
720	11 Jun 08	jari	12	# modify it under the terms of the GNU General Public License
720	11 Jun 08	jari	13	# as published by the Free Software Foundation; either version 2
720	11 Jun 08	jari	14	# of the License, or (at your option) any later version.
720	11 Jun 08	jari	15
720	11 Jun 08	jari	16	# This program is distributed in the hope that it will be useful,
720	11 Jun 08	jari	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
720	11 Jun 08	jari	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
720	11 Jun 08	jari	19	# GNU General Public License for more details.
720	11 Jun 08	jari	20
720	11 Jun 08	jari	21	# You should have received a copy of the GNU General Public License
720	11 Jun 08	jari	22	# along with this program; if not, write to the Free Software
720	11 Jun 08	jari	23	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
720	11 Jun 08	jari	24
720	11 Jun 08	jari	25	# johan.staaf@onk.lu.se
720	11 Jun 08	jari	26	# Johan Staaf, Dept Oncology, Lund University, S-221 85 Lund, Sweden
720	11 Jun 08	jari	27
720	11 Jun 08	jari	28	use strict;
720	11 Jun 08	jari	29	use FindBin ();
720	11 Jun 08	jari	30
720	11 Jun 08	jari	31	# VER 1.0 of cgh_dataDumper.pl
720	11 Jun 08	jari	32
720	11 Jun 08	jari	33
720	11 Jun 08	jari	34	######## BASE file requirements #########
720	11 Jun 08	jari	35	# The BASEfile format should be normal.
720	11 Jun 08	jari	36	# expect per row:
720	11 Jun 08	jari	37	# [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	38
720	11 Jun 08	jari	39
720	11 Jun 08	jari	40	############################################
720	11 Jun 08	jari	41	############# MAIN PROGRAM #############
720	11 Jun 08	jari	42	############################################
720	11 Jun 08	jari	43
720	11 Jun 08	jari	44
720	11 Jun 08	jari	45	### Main Variables ###
720	11 Jun 08	jari	46	my (@final);
720	11 Jun 08	jari	47	my (%param_hash,%base_hash,%bacToGene);
720	11 Jun 08	jari	48	my @annoStat;
720	11 Jun 08	jari	49	#################
720	11 Jun 08	jari	50
720	11 Jun 08	jari	51	## Read the file and extract BASE column data #######
720	11 Jun 08	jari	52
720	11 Jun 08	jari	53	$/="%"; #Gives every section as one lump
720	11 Jun 08	jari	54	my $i=0;
720	11 Jun 08	jari	55
720	11 Jun 08	jari	56	while(<>){
720	11 Jun 08	jari	57	$i++;
720	11 Jun 08	jari	58	chomp;
720	11 Jun 08	jari	59	if($i==1){ #Parameters
720	11 Jun 08	jari	60
720	11 Jun 08	jari	61	$_ =~ /filename\t(.+)\n/;
720	11 Jun 08	jari	62	$param_hash{'filename'}=$1;
720	11 Jun 08	jari	63	$_ =~ /sort\t(\w+)\n/;
720	11 Jun 08	jari	64	$param_hash{'sort'}=$1;
720	11 Jun 08	jari	65	$_ =~ /exportMode\t(\w+)\n/;
720	11 Jun 08	jari	66	$param_hash{'exportMode'}=$1;
720	11 Jun 08	jari	67	$_ =~ /annotationType\t(.+)\n/;
720	11 Jun 08	jari	68	$param_hash{'annotationType'}=$1;
720	11 Jun 08	jari	69
720	11 Jun 08	jari	70	$_ =~ /annoForStat\t(.+)\n/; #in format \|ER Status\|p_brca_family_status\|etc..\|
720	11 Jun 08	jari	71	$param_hash{'annoForStat'}=$1;
720	11 Jun 08	jari	72	my @tmp=split(/\\|/,$param_hash{'annoForStat'}); #holding the annotations to get stats for!
720	11 Jun 08	jari	73	foreach my $line (@tmp){
720	11 Jun 08	jari	74	if($line ne ""){
720	11 Jun 08	jari	75	push(@annoStat,$line);
720	11 Jun 08	jari	76	}
720	11 Jun 08	jari	77	}
720	11 Jun 08	jari	78	}elsif($i==2){ #section assays info
720	11 Jun 08	jari	79	$_=~ /count\t(\d*)/;
720	11 Jun 08	jari	80	$base_hash{'NumberOfAssays'}=$1;
720	11 Jun 08	jari	81	$_=~ /columns\t(.*)/;
720	11 Jun 08	jari	82	$base_hash{'AssayColumns'}=$1;
720	11 Jun 08	jari	83	$_=~ /annotationColumns\t(.*)/;
720	11 Jun 08	jari	84	$base_hash{'AnnotationColumns'}=$1;
720	11 Jun 08	jari	85	}elsif($i==3){ #assay info + section spots
720	11 Jun 08	jari	86	$_=~ /columns\t(.*)\tassayData/;
720	11 Jun 08	jari	87	$base_hash{'Columns'}=$1;
720	11 Jun 08	jari	88	$_=~ /channels\t(.*)/;
720	11 Jun 08	jari	89	$base_hash{'channels'}=$1;
720	11 Jun 08	jari	90	$_=~/assayFields\t(.*)/;
720	11 Jun 08	jari	91	$base_hash{'assayFields'}=$1;
720	11 Jun 08	jari	92	(my $text,my $dump)=split("\n\n",$_);
720	11 Jun 08	jari	93	$text=~ s/^\n//;
720	11 Jun 08	jari	94	$base_hash{'Assays'}=$text;
720	11 Jun 08	jari	95	last; #Dont read in the large bulk of data now!
720	11 Jun 08	jari	96	}
720	11 Jun 08	jari	97	}
720	11 Jun 08	jari	98	#### OBSERVE, all base headers now printed to STDOUT!
720	11 Jun 08	jari	99
720	11 Jun 08	jari	100
720	11 Jun 08	jari	101
720	11 Jun 08	jari	102	## Do export of data points, global boolean
720	11 Jun 08	jari	103	my $doExport="TRUE";
720	11 Jun 08	jari	104
720	11 Jun 08	jari	105
720	11 Jun 08	jari	106	#### Locate the assay field columns for each annotation group to extract data from
720	11 Jun 08	jari	107	my @assays = split("\n",$base_hash{'Assays'});
720	11 Jun 08	jari	108	my @assaynames = ();
720	11 Jun 08	jari	109	foreach my $line (@assays) {
720	11 Jun 08	jari	110	my @tmp=split("\t",$line);
720	11 Jun 08	jari	111	push(@assaynames,$tmp[1]); #keep only the assay name in the order sent to the plugin
720	11 Jun 08	jari	112	}
720	11 Jun 08	jari	113
720	11 Jun 08	jari	114	if($param_hash{'exportMode'} eq "sampleName_annotation"){
720	11 Jun 08	jari	115	my @sampleannotations;
720	11 Jun 08	jari	116
720	11 Jun 08	jari	117	### Assign an assay to an annotation type and save in @control vector
720	11 Jun 08	jari	118	my @base_annotations=split("\t",$base_hash{'AnnotationColumns'});
720	11 Jun 08	jari	119	my $annotation_index= -1; #tracks which column index to use to get the annotation values
720	11 Jun 08	jari	120	for (my $i=0;$i<@base_annotations;$i++){
720	11 Jun 08	jari	121	if($param_hash{'annotationType'} eq $base_annotations[$i]){ #found the right annotation!
720	11 Jun 08	jari	122	$annotation_index=$i;
720	11 Jun 08	jari	123	last;
720	11 Jun 08	jari	124	}
720	11 Jun 08	jari	125	}
720	11 Jun 08	jari	126	for (my $i=0;$i<@assays;$i++) {
720	11 Jun 08	jari	127	my @tmp=split("\t",$assays[$i]);
720	11 Jun 08	jari	128	if($annotation_index > -1){ # a real annotation index found
720	11 Jun 08	jari	129	if($tmp[$annotation_index+2] eq ""){ #[assaynbr] [assayname] then annotations, thats why +2
720	11 Jun 08	jari	130	$sampleannotations[$i]="NONE"; #Na for annotation for this assay!
720	11 Jun 08	jari	131	}else{
720	11 Jun 08	jari	132	$sampleannotations[$i]=$tmp[$annotation_index+2]; #a real annotation
720	11 Jun 08	jari	133	}
720	11 Jun 08	jari	134	}else{
720	11 Jun 08	jari	135	#basically this means no annotation analysis from start, use Na instead in control file
720	11 Jun 08	jari	136	$sampleannotations[$i]="NONE"; #Na for annotation for this assay!
720	11 Jun 08	jari	137
720	11 Jun 08	jari	138	}
720	11 Jun 08	jari	139	}
720	11 Jun 08	jari	140
720	11 Jun 08	jari	141	### print the binding of @assaynames, @sampleannotations
720	11 Jun 08	jari	142	open(myfile,">SampleNames_and_selectedAnnotation.txt");
720	11 Jun 08	jari	143	for (my $i=0;$i<@assaynames;$i++) {
720	11 Jun 08	jari	144	print myfile "$assaynames[$i]\t$sampleannotations[$i]\n";
720	11 Jun 08	jari	145	}
720	11 Jun 08	jari	146	close(myfile);
720	11 Jun 08	jari	147	$doExport="FALSE";
720	11 Jun 08	jari	148	}elsif($param_hash{'exportMode'} eq "sampleName"){
720	11 Jun 08	jari	149
720	11 Jun 08	jari	150	### print @assaynames
720	11 Jun 08	jari	151	open(myfile,">SampleNames.txt");
720	11 Jun 08	jari	152	for (my $i=0;$i<@assaynames;$i++) {
720	11 Jun 08	jari	153	print myfile "$assaynames[$i]\n";
720	11 Jun 08	jari	154	}
720	11 Jun 08	jari	155	close(myfile);
720	11 Jun 08	jari	156	$doExport="FALSE";
720	11 Jun 08	jari	157	}elsif(($param_hash{'exportMode'} eq "annotationStatistics") && ($param_hash{'annoForStat'} ne "NULL") ){
720	11 Jun 08	jari	158	my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample
720	11 Jun 08	jari	159
720	11 Jun 08	jari	160	### Fixing the issue with blanks ( :( )
720	11 Jun 08	jari	161	my $annos=$base_hash{'Assays'};
720	11 Jun 08	jari	162	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	163	$annos =~s/\t\n/\tNA\n/g;
720	11 Jun 08	jari	164	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	165	my @annos=split("\n",$annos); #a row is a sample with its annotations
720	11 Jun 08	jari	166
720	11 Jun 08	jari	167	#print "$base_hash{'Assays'}";
720	11 Jun 08	jari	168	#print "\n#####\n";
720	11 Jun 08	jari	169	#print "nbr assay $base_hash{'AnnotationColumns'}\n";
720	11 Jun 08	jari	170	#print "$annos";
720	11 Jun 08	jari	171
720	11 Jun 08	jari	172	### Find the subset of annotations to get statistics for
720	11 Jun 08	jari	173	my @annotation_index; #tracks which column index to use to get the annotation values
720	11 Jun 08	jari	174	for (my $i=0;$i<@base_annotations;$i++){
720	11 Jun 08	jari	175	foreach my $anno (@annoStat){
720	11 Jun 08	jari	176	if(($anno eq $base_annotations[$i]) && ($anno ne "")){ #found the right annotation!
720	11 Jun 08	jari	177	push(@annotation_index,$i);
720	11 Jun 08	jari	178	last;
720	11 Jun 08	jari	179	}
720	11 Jun 08	jari	180	}
720	11 Jun 08	jari	181	}
720	11 Jun 08	jari	182
720	11 Jun 08	jari	183	# @annoStat and @annotation_index are now matched!
720	11 Jun 08	jari	184	my %annoStat; #key = selected annotation type, value = the column of values for that type,
720	11 Jun 08	jari	185	# separated with tab.
720	11 Jun 08	jari	186	my %annoUnique; #key = selected annotation type, value = unique annotation values for that type
720	11 Jun 08	jari	187	for(my $i=0;$i<@annoStat;$i++){
720	11 Jun 08	jari	188	#foreach selected annotation extract its column
720	11 Jun 08	jari	189	my @tempCol;
720	11 Jun 08	jari	190	foreach my $line (@annos) {
720	11 Jun 08	jari	191	my @tmp=split("\t",$line);
720	11 Jun 08	jari	192	shift(@tmp); #removing assay id
720	11 Jun 08	jari	193	shift(@tmp); #removing assay name
720	11 Jun 08	jari	194	# only the annotations are left!
720	11 Jun 08	jari	195	push(@tempCol,$tmp[$annotation_index[$i]]);
720	11 Jun 08	jari	196	}
720	11 Jun 08	jari	197	$annoStat{$annoStat[$i]}=join("\t",@tempCol);
720	11 Jun 08	jari	198	my %saw;
720	11 Jun 08	jari	199	@saw{@tempCol}=();
720	11 Jun 08	jari	200	$annoUnique{$annoStat[$i]}=join("\t",keys(%saw));
720	11 Jun 08	jari	201	}
720	11 Jun 08	jari	202	open(myfile,">AnnotationStatistics.txt");
720	11 Jun 08	jari	203	foreach my $annotype (keys(%annoUnique)){
720	11 Jun 08	jari	204	#count how many instances of each unique annotation value there is for this annotation type
720	11 Jun 08	jari	205	my @uniqueV=split("\t",$annoUnique{$annotype});
720	11 Jun 08	jari	206	my @values=split("\t",$annoStat{$annotype});
720	11 Jun 08	jari	207	my %saw;
720	11 Jun 08	jari	208	print myfile "$annotype\tNbrAssays\n";
720	11 Jun 08	jari	209	foreach my $uniqueValue (@uniqueV){
720	11 Jun 08	jari	210	my $r=scalar(grep(/$uniqueValue/,@values));
720	11 Jun 08	jari	211	print myfile "$uniqueValue\t$r\n";
720	11 Jun 08	jari	212	}
720	11 Jun 08	jari	213	print myfile "\n";
720	11 Jun 08	jari	214	}
720	11 Jun 08	jari	215	close(myfile);
720	11 Jun 08	jari	216	$doExport="FALSE";
720	11 Jun 08	jari	217	}elsif($param_hash{'exportMode'} eq "annotationAll"){
720	11 Jun 08	jari	218	#create an annotation file for all annotations
720	11 Jun 08	jari	219
720	11 Jun 08	jari	220	my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample
720	11 Jun 08	jari	221
720	11 Jun 08	jari	222	### Fixing the issue with blanks ( :( )
720	11 Jun 08	jari	223	my $annos=$base_hash{'Assays'};
720	11 Jun 08	jari	224	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	225	$annos =~s/\t\n/\tNA\n/g;
720	11 Jun 08	jari	226	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	227	my @annos=split("\n",$annos); #a row is a sample with its annotations
720	11 Jun 08	jari	228
720	11 Jun 08	jari	229	open(myfile,">AnnotationAllFile.txt");
720	11 Jun 08	jari	230	print myfile "Assay\t$base_hash{'AnnotationColumns'}\n";
720	11 Jun 08	jari	231	foreach my $line (@annos){
720	11 Jun 08	jari	232	my @tmp=split("\t",$line);
720	11 Jun 08	jari	233	shift(@tmp); #remove the BASE assay id nummer
720	11 Jun 08	jari	234	$line=join("\t",@tmp);
720	11 Jun 08	jari	235	print myfile "$line\n";
720	11 Jun 08	jari	236	}
720	11 Jun 08	jari	237	close(myfile);
720	11 Jun 08	jari	238	$doExport="FALSE";
720	11 Jun 08	jari	239	}elsif(($param_hash{'exportMode'} eq "annotationSelected") && ($param_hash{'annoForStat'} ne "NULL")){
720	11 Jun 08	jari	240	#create an annotation file based on only selected annotations
720	11 Jun 08	jari	241
720	11 Jun 08	jari	242	my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample
720	11 Jun 08	jari	243
720	11 Jun 08	jari	244	### Fixing the issue with blanks ( :( )
720	11 Jun 08	jari	245	my $annos=$base_hash{'Assays'};
720	11 Jun 08	jari	246	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	247	$annos =~s/\t\n/\tNA\n/g;
720	11 Jun 08	jari	248	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	249	my @annos=split("\n",$annos); #a row is a sample with its annotations
720	11 Jun 08	jari	250
720	11 Jun 08	jari	251
720	11 Jun 08	jari	252	### Find the subset of annotations to get statistics for
720	11 Jun 08	jari	253	my @annotation_index; #tracks which column index to use to get the annotation values
720	11 Jun 08	jari	254	my @matched_annoStat;
720	11 Jun 08	jari	255
720	11 Jun 08	jari	256	foreach my $anno (@annoStat){
720	11 Jun 08	jari	257	if($anno =~/~/){
720	11 Jun 08	jari	258	# pattern matching!! Get all annotations that match the specified pattern
720	11 Jun 08	jari	259	# e.g. ~Her130 = all Her130 annotations
720	11 Jun 08	jari	260	$anno=~ /~(.+)/;
720	11 Jun 08	jari	261	my $pattern=$1;
720	11 Jun 08	jari	262	for(my $i=0;$i<@base_annotations;$i++){
720	11 Jun 08	jari	263	if($base_annotations[$i]=~/$pattern/){
720	11 Jun 08	jari	264	push(@matched_annoStat,$base_annotations[$i]);
720	11 Jun 08	jari	265	push(@annotation_index,$i);
720	11 Jun 08	jari	266	}
720	11 Jun 08	jari	267	}
720	11 Jun 08	jari	268	}else{
720	11 Jun 08	jari	269	for (my $i=0;$i<@base_annotations;$i++){
720	11 Jun 08	jari	270	if(($anno eq $base_annotations[$i]) && ($anno ne "")){ #found the right annotation!
720	11 Jun 08	jari	271	push(@matched_annoStat,$anno);
720	11 Jun 08	jari	272	push(@annotation_index,$i);
720	11 Jun 08	jari	273	last;
720	11 Jun 08	jari	274	}
720	11 Jun 08	jari	275	}
720	11 Jun 08	jari	276	}
720	11 Jun 08	jari	277	}
720	11 Jun 08	jari	278	# @annoStat and @annotation_index are now matched!
720	11 Jun 08	jari	279
720	11 Jun 08	jari	280	my $annoHeader=join("\t","Assay",@matched_annoStat);
720	11 Jun 08	jari	281	open(myfile,">AnnotationSelectedFile.txt");
720	11 Jun 08	jari	282	print myfile "$annoHeader\n";
720	11 Jun 08	jari	283	foreach my $line (@annos){
720	11 Jun 08	jari	284	my @tmp=split("\t",$line);
720	11 Jun 08	jari	285	my $printline=$tmp[1]; #assay name
720	11 Jun 08	jari	286	for(my $i=0;$i<@matched_annoStat;$i++){
720	11 Jun 08	jari	287	#print "$annotation_index[$i] $tmp[$annotation_index[$i]+2]\n";
720	11 Jun 08	jari	288	$printline=join("\t",$printline,$tmp[$annotation_index[$i]+2]);
720	11 Jun 08	jari	289	}
720	11 Jun 08	jari	290	print myfile "$printline\n";
720	11 Jun 08	jari	291	}
720	11 Jun 08	jari	292	close(myfile);
720	11 Jun 08	jari	293	$doExport="FALSE";
720	11 Jun 08	jari	294	}elsif(($param_hash{'exportMode'} eq "annotationDisplaySelected") && ($param_hash{'annoForStat'} ne "NULL")){
720	11 Jun 08	jari	295	#create an annotation display file based on only selected annotations
720	11 Jun 08	jari	296
720	11 Jun 08	jari	297	my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample
720	11 Jun 08	jari	298
720	11 Jun 08	jari	299	### Fixing the issue with blanks ( :( )
720	11 Jun 08	jari	300	my $annos=$base_hash{'Assays'};
720	11 Jun 08	jari	301	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	302	$annos =~s/\t\n/\tNA\n/g;
720	11 Jun 08	jari	303	$annos =~s/\t\t/\tNA\t/g;
720	11 Jun 08	jari	304	my @annos=split("\n",$annos); #a row is a sample with its annotations
720	11 Jun 08	jari	305
720	11 Jun 08	jari	306
720	11 Jun 08	jari	307	### Find the subset of annotations to get statistics for
720	11 Jun 08	jari	308	my @annotation_index; #tracks which column index to use to get the annotation values
720	11 Jun 08	jari	309	my @matched_annoStat;
720	11 Jun 08	jari	310
720	11 Jun 08	jari	311	foreach my $anno (@annoStat){
720	11 Jun 08	jari	312	if($anno =~/~/){
720	11 Jun 08	jari	313	# pattern matching!! Get all annotations that match the specified pattern
720	11 Jun 08	jari	314	# e.g. ~Her130 = all Her130 annotations
720	11 Jun 08	jari	315	$anno=~ /~(.+)/;
720	11 Jun 08	jari	316	my $pattern=$1;
720	11 Jun 08	jari	317	for(my $i=0;$i<@base_annotations;$i++){
720	11 Jun 08	jari	318	if($base_annotations[$i]=~/$pattern/){
720	11 Jun 08	jari	319	push(@matched_annoStat,$base_annotations[$i]);
720	11 Jun 08	jari	320	push(@annotation_index,$i);
720	11 Jun 08	jari	321	}
720	11 Jun 08	jari	322	}
720	11 Jun 08	jari	323	}else{
720	11 Jun 08	jari	324	for (my $i=0;$i<@base_annotations;$i++){
720	11 Jun 08	jari	325	if(($anno eq $base_annotations[$i]) && ($anno ne "")){ #found the right annotation!
720	11 Jun 08	jari	326	push(@matched_annoStat,$anno);
720	11 Jun 08	jari	327	push(@annotation_index,$i);
720	11 Jun 08	jari	328	last;
720	11 Jun 08	jari	329	}
720	11 Jun 08	jari	330	}
720	11 Jun 08	jari	331	}
720	11 Jun 08	jari	332	}
720	11 Jun 08	jari	333	# @annoStat and @annotation_index are now matched!
720	11 Jun 08	jari	334	# i.e matched_annoStat contains all annotation types to extract data from,
720	11 Jun 08	jari	335	# and annotation_index contains which columns to use respectively.
720	11 Jun 08	jari	336
720	11 Jun 08	jari	337	# go through each matched_annoStat entry, find the unique annotation values (k)
720	11 Jun 08	jari	338	# and for each unique value, create a column with value 1 for match on the unique value
720	11 Jun 08	jari	339	#for a given sample, 0 for no match. Iterate this over all matched annotation types.
720	11 Jun 08	jari	340	# blank samples have the value NA in @annos. They retain NA in the export, however NA is not
720	11 Jun 08	jari	341	# allowed as a column type!
720	11 Jun 08	jari	342
720	11 Jun 08	jari	343	my @annotationDisplayMatrix; #N rows = N samples
720	11 Jun 08	jari	344	my @annotationDisplayHeader;
720	11 Jun 08	jari	345	push(@annotationDisplayHeader,"AssayName");
720	11 Jun 08	jari	346
720	11 Jun 08	jari	347	#Initiate the Display matrix with sample names
720	11 Jun 08	jari	348	for(my $j=0;$j<@annos;$j++){
720	11 Jun 08	jari	349	my @tmp=split("\t",$annos[$j]);
720	11 Jun 08	jari	350	push(@annotationDisplayMatrix,$tmp[1]); #initiate with sample name!
720	11 Jun 08	jari	351	}
720	11 Jun 08	jari	352
720	11 Jun 08	jari	353	for(my $i=0;$i<@matched_annoStat;$i++){
720	11 Jun 08	jari	354	my %uniqueValues;
720	11 Jun 08	jari	355	my @uniqueValueColumns;
720	11 Jun 08	jari	356
720	11 Jun 08	jari	357	#find unique annotation values
720	11 Jun 08	jari	358	for(my $j=0;$j<@annos;$j++){
720	11 Jun 08	jari	359	my @tmp=split("\t",$annos[$j]);
720	11 Jun 08	jari	360	if(($tmp[$annotation_index[$i]+2] eq "NA") \|\| ($tmp[$annotation_index[$i]+2] eq "")){
720	11 Jun 08	jari	361	#not allowed as a unique entry
720	11 Jun 08	jari	362	}else{
720	11 Jun 08	jari	363	$uniqueValues{$tmp[$annotation_index[$i]+2]}="";
720	11 Jun 08	jari	364	}
720	11 Jun 08	jari	365	}
720	11 Jun 08	jari	366	my @uniqueValueColumn=sort(keys(%uniqueValues));
720	11 Jun 08	jari	367
720	11 Jun 08	jari	368
720	11 Jun 08	jari	369	#foreach of the unique annotation values, make a column, 1 for present, 0 for absent
720	11 Jun 08	jari	370	# for each sample
720	11 Jun 08	jari	371	foreach my $uniqueValue (@uniqueValueColumn){
720	11 Jun 08	jari	372	#my $ttt<-$uniqueValue;
720	11 Jun 08	jari	373	#$ttt =~s/\s/\_/g;
720	11 Jun 08	jari	374	push(@annotationDisplayHeader,$uniqueValue); #push to the header
720	11 Jun 08	jari	375	for(my $j=0;$j<@annos;$j++){
720	11 Jun 08	jari	376	#for each sample
720	11 Jun 08	jari	377	my @tmp=split("\t",$annos[$j]);
720	11 Jun 08	jari	378
720	11 Jun 08	jari	379	if($tmp[$annotation_index[$i]+2] eq $uniqueValue){
720	11 Jun 08	jari	380	# push in a 1
720	11 Jun 08	jari	381	$annotationDisplayMatrix[$j]=$annotationDisplayMatrix[$j]."\t".1;
720	11 Jun 08	jari	382	}else{
720	11 Jun 08	jari	383	#push in a 0
720	11 Jun 08	jari	384	$annotationDisplayMatrix[$j]=$annotationDisplayMatrix[$j]."\t".0;
720	11 Jun 08	jari	385	}
720	11 Jun 08	jari	386	}
720	11 Jun 08	jari	387	}
720	11 Jun 08	jari	388	}
720	11 Jun 08	jari	389
720	11 Jun 08	jari	390	my $annoHeader= join("\t",@annotationDisplayHeader);
720	11 Jun 08	jari	391
720	11 Jun 08	jari	392	open(myfile,">AnnotationDisplaySelectedFile.txt");
720	11 Jun 08	jari	393	print myfile "$annoHeader\n";
720	11 Jun 08	jari	394	foreach my $row (@annotationDisplayMatrix){
720	11 Jun 08	jari	395	print myfile "$row\n";
720	11 Jun 08	jari	396
720	11 Jun 08	jari	397	}
720	11 Jun 08	jari	398	close(myfile);
720	11 Jun 08	jari	399	$doExport="FALSE";
720	11 Jun 08	jari	400	}#end annotationDisplay
720	11 Jun 08	jari	401
720	11 Jun 08	jari	402
720	11 Jun 08	jari	403	if($doExport eq "TRUE"){
720	11 Jun 08	jari	404	#### Determine how many common columns to add to get to assaydata
720	11 Jun 08	jari	405	my @cc = split("\t",$base_hash{'Columns'});
720	11 Jun 08	jari	406
720	11 Jun 08	jari	407	#### Make header and calculate nbr of data columns in total
720	11 Jun 08	jari	408	my $header=join("\t",@cc,@assaynames);
720	11 Jun 08	jari	409	my $nbrCols=scalar(@cc)+scalar(@assaynames);
720	11 Jun 08	jari	410
720	11 Jun 08	jari	411
720	11 Jun 08	jari	412	####### Parse the spot data one row at a time #######
720	11 Jun 08	jari	413	# each row looks like:
720	11 Jun 08	jari	414	# [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	415	my $chromosomeIndex=2;
720	11 Jun 08	jari	416	my $geneIndex=1;
720	11 Jun 08	jari	417	my $repIdIndex=0;
720	11 Jun 08	jari	418	my $endPositionIndex=5;
720	11 Jun 08	jari	419	my @highlightToWrite=();
720	11 Jun 08	jari	420	my @toSort=(); #vector for holding data to sort in format as above
720	11 Jun 08	jari	421
720	11 Jun 08	jari	422	$/="\n"; #reset
720	11 Jun 08	jari	423	my $found = "FALSE";
720	11 Jun 08	jari	424
720	11 Jun 08	jari	425
720	11 Jun 08	jari	426	## Create a file with the assay names for the stac_format.R script. Also modify the header for stac output
720	11 Jun 08	jari	427	## so that the header is sample1 .. sampleN instead of real assay names (column conflict in R for weird assay names
720	11 Jun 08	jari	428	if($param_hash{'exportMode'} eq "stac"){
720	11 Jun 08	jari	429	$header=join("\t",@cc);
720	11 Jun 08	jari	430	my $index=0;
720	11 Jun 08	jari	431	open(myoutfile,">assays.txt");
720	11 Jun 08	jari	432	foreach my $line (@assaynames){
720	11 Jun 08	jari	433	$index++;
720	11 Jun 08	jari	434	print myoutfile "$line\n";
720	11 Jun 08	jari	435	$header=$header."\t"."sample".$index;
720	11 Jun 08	jari	436	}
720	11 Jun 08	jari	437	close(myoutfile);
720	11 Jun 08	jari	438	}
720	11 Jun 08	jari	439
720	11 Jun 08	jari	440	if(($param_hash{'exportMode'} eq "standard_lite") \|\| ($param_hash{'exportMode'} eq "singleLite")){
720	11 Jun 08	jari	441	#lite format is reporterId,chromosome,startPosition,endPosition
720	11 Jun 08	jari	442	$header=join("\t","reporterId","chromosome","startPosition","endPosition",@assaynames);
720	11 Jun 08	jari	443	}
720	11 Jun 08	jari	444
720	11 Jun 08	jari	445	my $file="basematrix.txt";
720	11 Jun 08	jari	446	if($param_hash{'filename'} ne "NULL"){
720	11 Jun 08	jari	447	$file=$param_hash{'filename'};
720	11 Jun 08	jari	448	}
720	11 Jun 08	jari	449
720	11 Jun 08	jari	450	## open file for printing of data values
720	11 Jun 08	jari	451	if($param_hash{'exportMode'} ne "singleLite"){
720	11 Jun 08	jari	452	open (myoutfile,">$file");
720	11 Jun 08	jari	453	}
720	11 Jun 08	jari	454
720	11 Jun 08	jari	455	if(($param_hash{'exportMode'} ne "agilent") \|\| ($param_hash{'exportMode'} ne "bed") \|\| ($param_hash{'exportMode'} ne "singleLite") ){
720	11 Jun 08	jari	456	#print defined header if not Agilent mode, or BED or single lite
720	11 Jun 08	jari	457	print myoutfile "$header\n";
720	11 Jun 08	jari	458	}
720	11 Jun 08	jari	459
720	11 Jun 08	jari	460	while(<>){
720	11 Jun 08	jari	461	chomp;
720	11 Jun 08	jari	462	if($_ =~ /section spots/){ #found the spot section
720	11 Jun 08	jari	463	$found="TRUE";
720	11 Jun 08	jari	464	next; #skip to next line ?
720	11 Jun 08	jari	465	}
720	11 Jun 08	jari	466	if(($_ !~/columns/) \|\| ($_ !~/channels 2/) \|\| ($_ !~ /assayFields/) \|\| ($_ !~/assays/) \|\| ($_ !~/%/) && ($found eq "TRUE") ){
720	11 Jun 08	jari	467	# now reading real data line
720	11 Jun 08	jari	468	if ($_ eq ""){
720	11 Jun 08	jari	469	# do nada
720	11 Jun 08	jari	470	}else{
720	11 Jun 08	jari	471	# [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	472	my @temp = split("\t",$_); #split each line
720	11 Jun 08	jari	473	if( ($temp[$endPositionIndex]>0) && ($temp[$chromosomeIndex]=~ /\w/) && ($temp[$chromosomeIndex]>0) ){ #reporterId must have a bp end and a chromosome not = ""
720	11 Jun 08	jari	474	foreach my $row (@temp){ #replace empty string with NaN ..
720	11 Jun 08	jari	475	if($row eq ""){
720	11 Jun 08	jari	476	$row = "NaN";
720	11 Jun 08	jari	477	}
720	11 Jun 08	jari	478	}
720	11 Jun 08	jari	479	if(scalar(@temp)<$nbrCols){ # To cope with dropped tabs..
720	11 Jun 08	jari	480	while(scalar(@temp)<$nbrCols){
720	11 Jun 08	jari	481	push(@temp,"NaN");
720	11 Jun 08	jari	482	}
720	11 Jun 08	jari	483	}
720	11 Jun 08	jari	484	my $line;
720	11 Jun 08	jari	485	if(($param_hash{'exportMode'} eq "standard_lite") \|\| ($param_hash{'exportMode'} eq "singleLite")){
720	11 Jun 08	jari	486	splice(@temp, 1, 1);
720	11 Jun 08	jari	487	#new format of temp
720	11 Jun 08	jari	488	# [reporterId] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	489	splice(@temp, 2, 1);
720	11 Jun 08	jari	490	#new format of temp
720	11 Jun 08	jari	491	# [reporterId] [chromosome] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	492	$line = join("\t",@temp);
720	11 Jun 08	jari	493	}elsif($param_hash{'exportMode'} eq "bed"){
720	11 Jun 08	jari	494	#format: chr start stop reporterId
720	11 Jun 08	jari	495	my $bed_chrom="chr".$temp[$chromosomeIndex];
720	11 Jun 08	jari	496	if($temp[$chromosomeIndex] eq "23"){
720	11 Jun 08	jari	497	$bed_chrom="chrX";
720	11 Jun 08	jari	498	}
720	11 Jun 08	jari	499	if($temp[$chromosomeIndex] eq "24"){
720	11 Jun 08	jari	500	$bed_chrom="chrY";
720	11 Jun 08	jari	501	}
720	11 Jun 08	jari	502	$line = join("\t",$bed_chrom,$temp[4],$temp[$endPositionIndex],$temp[$repIdIndex]);
720	11 Jun 08	jari	503	}else{
720	11 Jun 08	jari	504	$line = join("\t",@temp);
720	11 Jun 08	jari	505	}
720	11 Jun 08	jari	506
720	11 Jun 08	jari	507	if(($param_hash{'sort'} eq "TRUE") \|\| ($param_hash{'exportMode'} eq "singleLite") \|\| ($param_hash{'exportMode'} eq "agilent") \|\| ($param_hash{'exportMode'} eq "stac") ){
720	11 Jun 08	jari	508	push(@toSort,$line);
720	11 Jun 08	jari	509	}else{
720	11 Jun 08	jari	510	print myoutfile "$line\n";
720	11 Jun 08	jari	511	}
720	11 Jun 08	jari	512	}
720	11 Jun 08	jari	513	}
720	11 Jun 08	jari	514	}# end of if ( !~/columns etc..
720	11 Jun 08	jari	515	} #end of while(<>)
720	11 Jun 08	jari	516
720	11 Jun 08	jari	517	if($param_hash{'exportMode'} eq "stac"){
720	11 Jun 08	jari	518	#to format the data correctly and easiest, use a small R script, stac_format.R
720	11 Jun 08	jari	519	open(myfile,"<$FindBin::Bin/stac_format.R");
720	11 Jun 08	jari	520	open (myoutfile, ">stac_formatCommands.R");
720	11 Jun 08	jari	521	while(<myfile>){
720	11 Jun 08	jari	522	chomp;
720	11 Jun 08	jari	523	print myoutfile "$_\n";
720	11 Jun 08	jari	524	}
720	11 Jun 08	jari	525	close (myfile);
720	11 Jun 08	jari	526	close (myoutfile);
720	11 Jun 08	jari	527
720	11 Jun 08	jari	528	my $success = system("R --no-save < stac_formatCommands.R > r-debug.txt");
720	11 Jun 08	jari	529
720	11 Jun 08	jari	530	#catch the output
720	11 Jun 08	jari	531
720	11 Jun 08	jari	532	}
720	11 Jun 08	jari	533
720	11 Jun 08	jari	534	## Sort data and print it if selected
720	11 Jun 08	jari	535	if(($param_hash{'sort'} eq "TRUE") \|\| ($param_hash{'exportMode'} eq "agilent") \|\| ($param_hash{'exportMode'} eq "singleLite")){
720	11 Jun 08	jari	536	#sort on chromosome and then on startPosition
720	11 Jun 08	jari	537	if($param_hash{'sort'} eq "TRUE"){
720	11 Jun 08	jari	538	if(($param_hash{'exportMode'} eq "standard_lite") \|\| ($param_hash{'exportMode'} eq "singleLite")){
720	11 Jun 08	jari	539	# [reporterId] [chromosome] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	540	@toSort =
720	11 Jun 08	jari	541	map $_->[0] =>
720	11 Jun 08	jari	542	sort { $a->[1] <=> $b->[1]
720	11 Jun 08	jari	543	\|\|
720	11 Jun 08	jari	544	$a->[2] <=> $b->[2] }
720	11 Jun 08	jari	545	map [ $_, (split /\t/)[1], (split /\t/)[2] ]
720	11 Jun 08	jari	546	#map [ $_, (split /\t/)[1,2] ]
720	11 Jun 08	jari	547	=> @toSort;
720	11 Jun 08	jari	548	}elsif($param_hash{'exportMode'} eq "bed"){
720	11 Jun 08	jari	549	#format: [chromosome] [startPosition] [stopPosition] [reporterId]
720	11 Jun 08	jari	550	@toSort =
720	11 Jun 08	jari	551	map $_->[0] =>
720	11 Jun 08	jari	552	sort { $a->[1] <=> $b->[1]
720	11 Jun 08	jari	553	\|\|
720	11 Jun 08	jari	554	$a->[2] <=> $b->[2] }
720	11 Jun 08	jari	555	map [ $_, (split /\t/)[0], (split /\t/)[1] ]
720	11 Jun 08	jari	556	=> @toSort;
720	11 Jun 08	jari	557	}else{
720	11 Jun 08	jari	558	#standard format
720	11 Jun 08	jari	559	# [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	560
720	11 Jun 08	jari	561	@toSort =
720	11 Jun 08	jari	562	map $_->[0] =>
720	11 Jun 08	jari	563	sort { $a->[1] <=> $b->[1] \|\|
720	11 Jun 08	jari	564	$a->[2] <=> $b->[2] }
720	11 Jun 08	jari	565	map [ $_, (split /\t/)[2], (split /\t/)[4] ]
720	11 Jun 08	jari	566	=> @toSort;
720	11 Jun 08	jari	567	}
720	11 Jun 08	jari	568	}#end if to sort data
720	11 Jun 08	jari	569	if($param_hash{'exportMode'} eq "singleLite"){
720	11 Jun 08	jari	570	# split the matrix into separate files...
720	11 Jun 08	jari	571	# [reporterId] [chromosome] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	572
720	11 Jun 08	jari	573	#split header
720	11 Jun 08	jari	574	my @tmpHeader=split("\t",$header);
720	11 Jun 08	jari	575	my $commonHeader=join("\t",shift(@tmpHeader),shift(@tmpHeader),shift(@tmpHeader),shift(@tmpHeader));
720	11 Jun 08	jari	576	my $ccN=4;
720	11 Jun 08	jari	577	for(my $i=0;$i<@tmpHeader;$i++){
720	11 Jun 08	jari	578	#foreach assay create individual files, through iteration.. VERY SLOW!
720	11 Jun 08	jari	579	my $fileN=$tmpHeader[$i]."_".$file;
720	11 Jun 08	jari	580	open(mySingleFileHandle,">$fileN");
720	11 Jun 08	jari	581	print mySingleFileHandle "$commonHeader\t$tmpHeader[$i]\n";
720	11 Jun 08	jari	582	for(my $j=0;$j<@toSort;$j++){
720	11 Jun 08	jari	583	my @ttt= split("\t",$toSort[$j]);
720	11 Jun 08	jari	584	my $lineN=join("\t",$ttt[0],$ttt[1],$ttt[2],$ttt[3],$ttt[$i+$ccN]);
720	11 Jun 08	jari	585	print mySingleFileHandle "$lineN\n";
720	11 Jun 08	jari	586	}
720	11 Jun 08	jari	587	close(mySingleFileHandle);
720	11 Jun 08	jari	588	}
720	11 Jun 08	jari	589	}elsif($param_hash{'exportMode'} eq "agilent"){
720	11 Jun 08	jari	590	#create the agilent format and print
720	11 Jun 08	jari	591	my $headerA=join("\t","FeatureNum","Chr","Start","Stop","SystematicName","Gene Name","Description",@assaynames);
720	11 Jun 08	jari	592	my $featureNum=0;
720	11 Jun 08	jari	593	print myoutfile "AMADID 1234567\n"; #check if to use random number?
720	11 Jun 08	jari	594	print myoutfile "$headerA\n";
720	11 Jun 08	jari	595
720	11 Jun 08	jari	596	# Format of toSort:
720	11 Jun 08	jari	597	# [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M]
720	11 Jun 08	jari	598	foreach my $line (@toSort){
720	11 Jun 08	jari	599	$featureNum++;
720	11 Jun 08	jari	600	my @tmp=split("\t",$line);
720	11 Jun 08	jari	601	if($tmp[2] eq "23"){
720	11 Jun 08	jari	602	$tmp[2]="X";
720	11 Jun 08	jari	603	}elsif($tmp[2] eq "24"){
720	11 Jun 08	jari	604	$tmp[2]="Y";
720	11 Jun 08	jari	605	}
720	11 Jun 08	jari	606	#my $agilentLine=join("\t",$featureNum,$tmp[2],$tmp[4],$tmp[5],$tmp[0],$tmp[1],""); #if it works with bundled gene symbols..
720	11 Jun 08	jari	607	my $agilentLine=join("\t",$featureNum,$tmp[2],$tmp[4],$tmp[5],$tmp[0],"","");
720	11 Jun 08	jari	608
720	11 Jun 08	jari	609	splice(@tmp,0,6); #only log2data left
720	11 Jun 08	jari	610	$agilentLine=join("\t",$agilentLine,@tmp);
720	11 Jun 08	jari	611	print myoutfile "$agilentLine\n";
720	11 Jun 08	jari	612	}
720	11 Jun 08	jari	613	}else{
720	11 Jun 08	jari	614	#just print!
720	11 Jun 08	jari	615	foreach my $line (@toSort){
720	11 Jun 08	jari	616	print myoutfile "$line\n";
720	11 Jun 08	jari	617	}
720	11 Jun 08	jari	618	}
720	11 Jun 08	jari	619	}
720	11 Jun 08	jari	620	if($param_hash{'exportMode'} ne "singleLite"){
720	11 Jun 08	jari	621	close(myoutfile);
720	11 Jun 08	jari	622	}
720	11 Jun 08	jari	623	}#end doExport
720	11 Jun 08	jari	624