perl处理(遍历)较大(行数较多)文件的方法

发布时间:2020-09-16编辑:脚本学堂
perl在处理文件方面的能力还是很强的,处理行数较少的文件直接循环遍历就可以了,对于较大的文件则有些吃力。
不过可以借助perl的一些模块来实现,大致解决方案分为三种:

perl在处理文件方面的能力还是很强的,处理行数较少的文件直接循环遍历就可以了,对于较大的文件则有些吃力。
不过可以借助perl的一些模块来实现,大致解决方案分为三种:DB_File、Tie::File(Tie:File:AsHash)、然后就是DBI(先将数据导入数据库,然后从数据库中读取文件)。

本人认为用数据库存取数据时比较理想的方法,先来看如下的示例。
 

复制代码 代码如下:

#!/usr/bin/perl -w
use strict;
use DBI;

die "n",&die(),"n" unless @ARGV==7;
my ($tissue,$cpg_forw,$cpg_rev,$chg_forw,$chg_rev,$chh_forw,$chh_rev)=@ARGV;
my ($driver,$dsn,$root,$psword)=("mysql","database=methylome","root","123456");
my $dbh=DBI->connect("dbi:$driver:$dsn",$root,$psword) or die "$DBI::errstr";
   $dbh->do(qq(create table forw_CpG_$tissue(chrom VARCHAR(11),pos1 INT,pos2 INT,depth INT,meth_lev float)));
   $dbh->do(qq(load data local infile '$cpg_forw' into table forw_CpG_$tissue));
  
   $dbh->do(qq(create table rev_CpG_$tissue(chrom VARCHAR(11),pos1 INT,pos2 INT,depth INT,meth_lev float)));
   $dbh->do(qq(load data local infile '$cpg_rev' into table rev_CpG_$tissue));

 
   $dbh->do(qq(create table forw_CHG_$tissue(chrom VARCHAR(11),pos1 INT,pos2 INT,depth INT,meth_lev float)));
   $dbh->do(qq(load data local infile '$chg_forw' into table forw_CHG_$tissue));

   $dbh->do(qq(create table rev_CHG_$tissue(chrom VARCHAR(11),pos1 INT,pos2 INT,depth INT,meth_lev float)));
   $dbh->do(qq(load data local infile '$chg_rev' into table rev_CHG_$tissue));

   $dbh->do(qq(create table forw_CHH_$tissue(chrom VARCHAR(11),pos1 INT,pos2 INT,depth INT,meth_lev float)));
   $dbh->do(qq(load data local infile '$chh_forw' into table forw_CHH_$tissue));

   $dbh->do(qq(create table rev_CHH_$tissue(chrom VARCHAR(11),pos1 INT,pos2 INT,depth INT,meth_lev float)));
   $dbh->do(qq(load data local infile '$chh_rev' into table rev_CHH_$tissue));

$dbh->disconnect();

sub die{
    my $die=<<DIE;
perl *.pl <Tissue_name> <forw_CpG> <rev_CpG> <Forw_CHG> <Rev_CHG> <Forw_CHH> <Rev_CHH>
DIE
    return $die;
}

#--------------------------
 

复制代码 代码如下:

#!/usr/bin/perl -w
use strict;use DBI;
my $startTime=localtime();
die "n",&die(),"n" unless @ARGV==4;
my ($forw,$rev,$ele,$out)=@ARGV;

select STDOUT;
$|=1;
my ($driver,$dsn,$user,$psword)=("mysql","database=methylome","root","123456");
my $dbh=DBI->connect("dbi:$driver:$dsn",$user,$psword);
print "Connected!n";
my %meth_forw;my %nufor;my $meth_rev;my $nurev;my $flag=1;
open ELE,$ele or die;
while(my $line=<ELE>){
    chomp $line;
    print "$flag have done!!!n" if $flag00==0;$flag++;
    my ($chr,$element,$nu1,$nu2,$stran)=(split(/s+/,$line))[0,2,3,4,6];  # chromosome,gene element,eadge1,eadge2,strand
    &aa($element,$nu1,$nu2,$chr,$stran);
}

open OUT,"+>$out" or die;
my @sort=sort (keys %meth_forw);
foreach(@sort){
    my $methsta=$meth_forw{$_}/$nufor{$_};
    print OUT "$_t$methstat$meth_forw{$_}t$nufor{$_}n";
}
my $endTime=localtime();
print OUT "$startTimet$endTimen";
close OUT;
sub aa{
    my $strd=pop @_;          # strand
    my $chrs=pop @_;          #chromosome
    my $elem=shift @_;        #element
    my $flag=0;
  
    #positive strand
    foreach my $seg(@_){
        my $tem=$dbh->prepare(qq(select * from $forw where chrom='$chrs' and pos1>=$seg-100 and pos1<=$seg+100));
           $tem->execute();
        my ($chr,$pos1,$pos2,$depth,$meth_lev);
           $tem->bind_columns($chr,$pos1,$pos2,$depth,$meth_lev);
        while($tem->fetch()){
           next if $depth<4;my $pkey;
           if($strd eq "+"){
               next if !$pos1;
               $pkey=$pos1-$seg;
               $nufor{"forw$elem.$flag.$pkey"}++;
               $meth_forw{"forw$elem.$flag.$pkey"}+=$meth_lev;
           }else{
               next if !$pos1;
               $pkey=$seg-$pos1;
               my $flag_tem=0;
               $flag_tem=1 if $flag==0;
               $nufor{"forw$elem.$flag_tem.$pkey"}++;
               $meth_forw{"forw$elem.$flag_tem.$pkey"}+=$meth_lev;
           }
        }
    ++$flag;          # used to remind which eadge
    }
 
    #reverse stands
    foreach my $seg(@_){
        my $tem=$dbh->prepare(qq(select * from $rev where chrom='$chrs' and pos1>=$seg-100 and pos1<=$seg+100));
           $tem->execute();
        my ($chr,$pos1,$pos2,$depth,$meth_lev);
           $tem->bind_columns($chr,$pos1,$pos2,$depth,$meth_lev);
        while($tem->fetch()){
           next if $depth<4;my $pkey;
           if($strd eq "+"){
               next if !$pos1;
               $pkey=$pos1-$seg;
               $nufor{"rev$elem.$flag.$pkey"}++;
               $meth_forw{"rev$elem.$flag.$pkey"}+=$meth_lev;
           }else{
               next if !$pos1;
               $pkey=$seg-$pos1;
               my $flag_tem=0;
               $flag_tem=1 if $flag==0;
               $nufor{"rev$elem.$flag_tem.$pkey"}++;
               $meth_forw{"rev$elem.$flag_tem.$pkey"}+=$meth_lev;
           }
        }
    ++$flag;          # used to remind which eadge
    }

}
$dbh->disconnect();
sub die{
    my $die=<<DIE;
perl *.pl <Forward_table> <Reverse_table> <Element>  <OUT>
The data has been loaded!
DIE
    return $die;
}