我的代碼循環遍歷目錄中的多個文件,解析每個文件並將每個文件的解析內容附加到FinalVariantfile.txt
。用於每個循環的重複輸出
該代碼有效,但會複製每個文件的內容。
當我用兩個文件運行代碼時,輸出包含4個文件。有人可以解釋爲什麼會發生這種情況,以及如何解決這個問題?
#!/usr/bin/perl -w
use strict;
#directory structure
my $home = "/data/";
my $tsvdirectory = $home . "test_all_runs/" . $ARGV[0];
my $tsvfiles = $home . "test_all_runs/" . $ARGV[0] . "/tsv_files.txt";
my $FinalVariants = $home . "test_all_runs/" . $ARGV[0] . "/FinalVariantfile.txt";
my @tsvfiles =();
my @currentlines =();
my $currentline = '';
my $currentCNVline = '';
my @currentCNVlines =();
my @HotSpotLines =();
my @CNVLines =();
# command to produce the vcf_files.txt file stored in each individual run
# directory; the file list includes solely vcf files which have not been
# previously prepared and/or annotated
my $cmd = `ls $tsvdirectory/FOCUS*\.tsv > $tsvfiles`;
# print "$cmd";
my $cmda = "ls $tsvdirectory/FOCUS*\.tsv > $tsvfiles";
# print "$cmda";
# this code opens the vcf_files.txt file and passes each line into an array for
# indidivudal manipulation
open(TXT2, "$tsvfiles");
while (<TXT2>) {
push(@tsvfiles, $_);
}
close(TXT2);
foreach (@tsvfiles) {
chop($_);
}
# this code then parses each of the files listed by name in the tsvfiles array
foreach (@tsvfiles) {
my $currenttsvfile = "$_"; # establishes the current file being manipulated
my $MDLfinaltsvfile = $currenttsvfile;
$MDLfinaltsvfile =~ s/\.tsv/_prepared\.txt/g;
# this series of variable calls names the various intermediate or
# final output files
my $MDLlinestsvfile = $currenttsvfile;
$MDLlinestsvfile =~ s/\.tsv/_withCNV\.txt/g;
my $Variantlinestsvfile = $currenttsvfile;
$Variantlinestsvfile =~ s/\.tsv/_HotSpot\.txt/g;
my $MDLtsvfile = $currenttsvfile;
$MDLtsvfile =~ s/\.tsv/_FilteredAllcolumns\.txt/g;
my $MDLsampleid = $currenttsvfile;
$MDLsampleid =~ s/\-oncogene.tsv//g;
print "The currentVCFis############# " . $currenttsvfile . "\n";
my @SampleID =();
@SampleID = split /\//, $MDLsampleid;
print "The sampleIDis##############" . $SampleID[4] . "\n";
my $CNVdata = $currenttsvfile;
$CNVdata =~ s/\.tsv/_cnv\.txt/g;
my $FinalCNVdata = $currenttsvfile;
$FinalCNVdata =~ s/\.tsv/_finalcnv\.txt/g;
my $cmd2 = `fgrep -v "#" $currenttsvfile > $MDLlinestsvfile`;
print "$cmd2"; # this code extracts from the current vcf file all of the
# lines of data and outputs them into a separate file
my $cmd5 = `grep -vwE "(CNV|intronic|synonymous|utr_3|utr_5)"
#removes lines that contain CNV/intronic/synonymous/utr_3/utr_5"
$MDLlinestsvfile > $Variantlinestsvfile`;
print "$cmd5";
open(my $fh_in, '<', $Variantlinestsvfile)
or die "cannot open $Variantlinestsvfile: $!\n";
#removes lines that contain 0/0 and ./. genotypes from field 70.
open(my $fh_out, '>', $MDLtsvfile)
or die "cannot open $MDLtsvfile: $!\n";
while (my $line = <$fh_in>) {
# tab/field-based:
my @fields = split(/\s+/, $line);
print $fh_out $line unless ($fields[70] =~ m|([0.])/\1|);
}
close($fh_in);
close($fh_out);
#open each filtered file with all columns and pushes it into array.
open(TXT2, "$MDLtsvfile");
while (<TXT2>) {
push(@HotSpotLines, $_);
}
close(TXT2);
foreach (@HotSpotLines) {
chop($_);
my @HotSpotEntries =();
my $currentMDLline = $_;
@HotSpotEntries = split(/\t/, $currentMDLline);
my $chr = $HotSpotEntries[9];
my $position = $HotSpotEntries[10];
my $cosmicids = $HotSpotEntries[21];
my $refforward = $HotSpotEntries[67];
my $genotype = $HotSpotEntries[70];
my $altforward = $HotSpotEntries[77];
my $altreverse = $HotSpotEntries[78];
my $cDNA = $HotSpotEntries[81];
my $exon = $HotSpotEntries[83];
my $conseq = $HotSpotEntries[84];
my $location = $HotSpotEntries[88];
my $geneclass = $HotSpotEntries[92];
my $aachange = $HotSpotEntries[98];
my $transcript = $HotSpotEntries[100];
$currentline
= $SampleID[4] . "\t"
. $chr . "\t"
. $position . "\t"
. $cosmicids . "\t"
. $refforward . "\t"
. $refreverse . "\t"
. $genotype . "\t"
. $altforward . "\t"
. $altreverse . "\t"
. $cDNA . "\t"
. $exon . "\t"
. $conseq . "\t"
. $location . "\t"
. $geneclass . "\t"
. $aachange . "\t"
. $transcript;
# print "The currentVCFlineis ".$currentline."\n";
push(@currentlines, $currentline);
}
my $i;
for ($i = 0; $i < @currentlines; $i += 1) {
my $currentguiline = $currentlines[$i];
my $cmd5 = `echo "$currentguiline" >> $FinalVariants`;
print "$cmd5";
#my $cmd9 = `sed -i '1i$SampleID[4]' $FinalVariants`; print $cmd9;
}
}
顯示的'@ tsvfiles'內容的'ls'命令後。這段代碼似乎過於複雜,並且很難遵循它似乎在做的事情。 –
'ls'列出tsv文件的路徑並將其輸出到tsv_files.txt中。每個文件只列出一次,因此,我不明白爲什麼它以某種方式遍歷每個文件兩次?我所做的只是列出感興趣的文件的路徑,循環遍歷每個文件,解析並轉儲到FinalVariantfile.txt中。我很開心學習完成相同的另一種方式,但是,由於我是新手,所以我試着說離開comlex正則表達式。 – user3781528