#!/usr/local/bin/perl # データ抽出 open DATA, 'index.html'; # 処理対象ファイル @data = ; foreach (@data) { s/\n//g; s/\t//g; s/\r//g; $data .= $_; } @h1 = split /

out3.dat'; foreach $h1 (@h1) { $h1 = '(.*)<\/h1>//) { $date = $1; ++$kousin; $date =~ /\-(..)(.*)/; $day = $1; $time = $2; ++$total_day if $day ne $last_day; $last_day = $day; @data2 = split /[^\(](.*?)<\/a>//) { $source_url = $1; $source_site = $2; $source_site_list{ $2 } = $1; ++$k; $j = $i; while ($data2 =~ s/(.*?)(.*?)<\/a>//) { $news->[$i]{ number } = $k; $news->[$i]{ date } = $date; $news->[$i]{ day } = $day; $news->[$i]{ time } = $time; $news->[$i]{ source_url } = $source_url; $news->[$i]{ source_site } = $source_site; $news->[$i]{ title } = $1; $news->[$i]{ url } = $2; $news->[$i]{ site } = $3; $news->[$i]{ title } =~ s/
//g; while ($news->[$i]{ title } =~ s/―@$//) { ; } $news->[$i]{ title } =~ s/[$i]{ title } =~ s/>/>/g; ++$i; } if ($i == $j) { $data2 =~s/
//g; $data2 =~ s/
[$i]{ title } = $data2; $news->[$i]{ number } = $k; $news->[$i]{ date } = $date; $news->[$i]{ day } = $day; $news->[$i]{ time } = $time; $news->[$i]{ source_url } = $source_url; $news->[$i]{ source_site } = $source_site; ++$i; $data2 = ''; } print OUT3 "$data2\n"; } } } }

# NOTE(review): the line above is the tail of the extraction section, kept
# byte-for-byte.  The extraction code in this copy of the script has lost text
# (e.g. "@data = ;" and "split /" with no pattern), apparently everything that
# looked like an HTML tag.  Restore it from the original script before running.

# ----------------------------------------------------------------------------
# Data aggregation
#
# Inputs (package variables filled in by the extraction section):
#   $news             array ref of article hashes (date, day, time, title, url,
#                     source_site, source_url, ...)
#   $i                number of entries stored in @$news
#   $kousin           number of update headings seen
#   $total_day        number of distinct days seen
#   %source_site_list source site name => source URL
#
# Outputs:
#   out.dat   one tab-separated row per article
#   out2.dat  three rankings of source sites: per article, per update and
#             per day, each followed by a "total" row
#
# The accumulators (%count, %count_time, %count_day, %last_time, %last_day,
# $total_by_time, $total_by_day) are left as package variables, like the rest
# of this non-strict script.
# ----------------------------------------------------------------------------

# Returns $count as a percentage of $total, cut (not rounded) to the first
# four characters of its string form, e.g. 1 of 3 gives "33.3".
# A zero or undefined $total yields 0 instead of a division-by-zero error.
sub _truncated_percent {
    my ($count, $total) = @_;
    return 0 unless $total;
    return substr(($count || 0) * 100 / $total, 0, 4);
}

# Prints $heading and then one "site<TAB>count<TAB>percent %" row per site to
# $fh.  Rows are ordered by descending count; equal counts are ordered by site
# name so that the report is identical from run to run.
sub _print_ranking {
    my ($fh, $heading, $sites, $count_of, $denominator) = @_;

    my @ranked = sort {
        ($count_of->{$b} || 0) <=> ($count_of->{$a} || 0)
            or $a cmp $b
    } @{$sites};

    print {$fh} $heading;
    for my $site (@ranked) {
        my $hits    = defined $count_of->{$site} ? $count_of->{$site} : '';
        my $percent = _truncated_percent($count_of->{$site}, $denominator);
        print {$fh} "$site\t$hits\t$percent %\n";
    }
    return;
}

my $article_count = $i         || 0;
my $update_count  = $kousin    || 0;
my $day_count     = $total_day || 0;

open(my $articles_fh, '>', 'out.dat')
    or die "Cannot open out.dat for writing: $!";

for my $idx (0 .. $article_count - 1) {
    my $item = $news->[$idx];
    my $site = $item->{source_site};

    ++$count{$site};

    # A site is counted once per distinct "date" value (one update) and once
    # per distinct "day" value.  Only the previous value per site is
    # remembered, so this relies on articles being stored in document order.
    if ($last_time{$site} ne $item->{date}) {
        ++$count_time{$site};
        ++$total_by_time;
        if ($last_day{$site} ne $item->{day}) {
            ++$count_day{$site};
            ++$total_by_day;
            $last_day{$site} = $item->{day};
        }
        $last_time{$site} = $item->{date};
    }

    # Missing fields (e.g. url on entries without a link) become empty columns.
    my @columns = map { defined $_ ? $_ : '' }
        @{$item}{qw(date day time title url source_site source_url)};
    print {$articles_fh} join("\t", @columns), "\n";
}

close($articles_fh) or die "Cannot close out.dat: $!";

open(my $report_fh, '>', 'out2.dat')
    or die "Cannot open out2.dat for writing: $!";

my @sites = keys %source_site_list;

# Per-article ranking.  The denominator is the article count itself, so the
# rows add up to the 100% printed on the total row.
_print_ranking($report_fh, "記事単位で集計\n", \@sites, \%count, $article_count);
print {$report_fh} "total\t$article_count\t100%\n\n";

# Per-update ranking: share of updates in which each site appears.
_print_ranking(
    $report_fh,
    "更新単位で集計(更新回数:${update_count} 回)\t\t更新時に取り上げられる確率\n",
    \@sites, \%count_time, $update_count,
);
print {$report_fh} "total\t" . ($total_by_time || 0) . "\n\n";

# Per-day ranking: share of days on which each site appears.
_print_ranking(
    $report_fh,
    "日付単位で集計(${day_count}日)\t\t1日あたりに取り上げられる確率\n",
    \@sites, \%count_day, $day_count,
);
print {$report_fh} "total\t" . ($total_by_day || 0) . "\n\n";

close($report_fh) or die "Cannot close out2.dat: $!";