標籤:

[Perl]多線程抓取外匯牌價

Github: vicyang/Exchange-Rates

注意這個代碼是通過傳參的方式使用的,舉個栗子:

GetExchangeData.pl 2016-01-01 2017-01-01 2017.perldb

單線程/多線程效率對比:

同樣的網路環境,原來獲取2017年1月的匯率記錄需要30-40秒左右,現在只需要 8 秒。

有空會做一個可視化互動查看的程序,之前做的太粗陋(封面圖)

=info 獲取中行外匯牌價-美元欄目的信息 Auth: 523066680 Date: 2017-10 https://github.com/vicyang/Exchange-Rates=cutuse Encode;use threads;use threads::shared;use Try::Tiny;use Time::HiRes qw/sleep/;use Time::Local;use File::Slurp;use Data::Dump qw/dump/;use Data::Dumper;use LWP::UserAgent;use HTML::TableExtract;use IO::Handle;STDOUT->autoflush(1);$Data::Dumper::Indent = 0;$Data::Dumper::Sortkeys = 1;our $URL = "http://srh.bankofchina.com/search/whpj/search.jsp";our $ua = LWP::UserAgent->new( timeout => 5, keep_alive => 1, agent => "Mozilla/5.0", );our $hash :shared;our @task :shared;$hash = shared_clone( {} );if ( check_arguments() == 0 ) { printf "Wrong format
"
; exit; }my ($from, $to, $file) = @ARGV;# my $file = "2015.perldb";# my $from = "2015-01-01";# my $to = "2016-01-01";my $time_a = Time::HiRes::time();my $pageid = 1;my @ths;grep { push @ths, threads->create( &func, $from, $to, $_ ) } ( 0 .. 5 );#循環插入任務,等待線程結束while ( threads->list( threads::running ) ) { grep { $task[$_] = $pageid++ if ( $task[$_] == 0 ); } (0..5);}#分離線程grep { $_->detach() } @ths;printf("Dumping ... ");my $dbstr = Dumper($hash);$dbstr =~s/(],)/$1
/g
;$dbstr =~s/(},)/$1
/g
;$dbstr =~s/(=> {)/$1
/g
;$dbstr =~s/(= {)/$1
/g
;write_file( $file, { binmode => ":raw" }, $dbstr );printf("Done
Time used: %.3f
"
, Time::HiRes::time() - $time_a );sub func{ my ($from, $to, $idx) = @_; my $content; my $timestamp; while (1) { if ( $task[$idx] == 0 ) { sleep 0.01; next; } $content = get_page( $from, $to, $task[$idx] ); #如果獲取信息失敗,重新get_page unless ($content =~/var m_nCurrPage = (d+)/) { printf "[%d] Try again: %4d <-
"
, $idx, $task[$idx]; next; } #如果頁碼和任務頁碼不匹配,結束任務 last if ( $1 != $task[$idx] ); $timestamp = get_exchange_data( $content ); #如果該頁沒有任何有效信息,結束任務 last if ( not defined $timestamp ); printf "[%d] mission: %4d time: %s
"
, $idx, $task[$idx], $timestamp; #歸零 $task[$idx] = 0; }}sub get_exchange_data{ my ( $html_str ) = @_; my ($obj, $table, $timestamp, $date, $time); #count => 1 表示選擇第二個表格 $obj = HTML::TableExtract->new( depth => 0, count => 1 ); $obj->parse($html_str); grep { $table = $_ } $obj->tables; for my $ele ( $table->rows ) { shift @$ele; "去掉第一行抬頭"; next if ( $ele->[1] eq "" ); "去掉第一列貨幣類型"; next if ( not $ele->[1] =~/d/ ); "表格最後一行為空"; $timestamp = pop @$ele; $timestamp =~/^(.{10}) (.{8})/; ($date, $time) = ($1, $2); if (not exists $hash->{$date}) { $hash->{$date} = shared_clone( {} ) } $hash->{$date}{$time} = shared_clone( [@$ele] ); #push @{$hash->{$date}}, shared_clone({$timestamp, join(" ", @$row)}); } return $timestamp;}sub get_page{ our $ua; my ($from, $to, $pageid) = @_; my $res; $res = $ua->post( $URL, [ erectDate => $from, nothing => $to, pjname => "1316", page => $pageid ] ); return $res->content();}sub time_to_date{ my ($sec, $min, $hour, $day, $mon, $year) = localtime( shift ); $mon += 1; $year += 1900; return sprintf "%d-%02d-%02d", $year,$mon,$day;}sub check_arguments{ if ( $#ARGV < 2 ) { printf "Too few arguments
"
; exit; } my ( $from, $to, $file) = @ARGV; my $today = time_to_date( time() ); my $res = 1; for my $dt ( $from, $to ) { if ( $dt =~/(d{4})-(d{2})-(d{2})/ ) { $res = 0 if ( $dt gt $today ); $res = 0 if ( $1 lt 2007 ); try { timelocal(0,0,0, $3, $2-1, $1-1900) } catch { $res = 0 }; } } $res = 0 if ( $from gt $to ); return $res;}

推薦閱讀:

TAG:Perl | 爬虫 | 数据 |