#!/usr/bin/perl
use strict;
use warnings;
use Encode qw/decode/;
use Encode qw/encode/;
use utf8;

# perl get_matrix_by_series.pl あらすじデータ シリーズ名データ count_noun_with_sub.pl出力ファイル get_noun_list.pl出力ファイル 無視する単語の頻度数 > 出力ファイル

# Build the ISBN -> book title lookup from the original synopsis data file
# (first command-line argument), read as tab-separated UTF-8 text.
defined $ARGV[0] or die "Missing argument 1: synopsis data file\n";
# Three-argument open so that the file name is never interpreted as a mode
# or a pipe; the lexical handle is released even if we die part-way through.
open my $ref_fh, '<', $ARGV[0]
	or die "Cannot open synopsis data file '$ARGV[0]': $!";
my %book = ();
while (my $raw = <$ref_fh>) {
	chomp $raw;
	my @data = split /\t/, decode('UTF-8', $raw);
	# Column 5 is the key (ISBN) and column 2 the value (book title).
	# Lines with fewer than six columns cannot supply a key, so skip them
	# instead of storing an entry under an undefined key.
	next if @data < 6;
	$book{$data[5]} = $data[2];
}
close $ref_fh;

# Build the lookup from the series-name data file (second command-line
# argument), read as tab-separated UTF-8 text.  Column 1 is the key and
# column 0 the value; the table is later indexed with the values stored in
# %book, so presumably column 1 is the book title and column 0 the series name.
defined $ARGV[1] or die "Missing argument 2: series name data file\n";
# Three-argument open so that the file name is never interpreted as a mode
# or a pipe.
open my $series_fh, '<', $ARGV[1]
	or die "Cannot open series name data file '$ARGV[1]': $!";
my %series = ();
while (my $raw = <$series_fh>) {
	chomp $raw;
	my @data = split /\t/, decode('UTF-8', $raw);
	# A line without a second column has no key; skip it rather than
	# registering its first column under an undefined key.
	next if @data < 2;
	$series{$data[1]} = $data[0];
}
close $series_fh;

# Read the count_noun_with_sub.pl output file (third command-line argument),
# tab-separated UTF-8 text, and build two lookups:
#   %count: noun (column 0) -> usage count (column 2)
#   %noun:  noun (column 0) -> list of its compound nouns (column 3)
defined $ARGV[2] or die "Missing argument 3: count_noun_with_sub.pl output file\n";
# Three-argument open so that the file name is never interpreted as a mode
# or a pipe.
open my $noun_fh, '<', $ARGV[2]
	or die "Cannot open noun count file '$ARGV[2]': $!";
my %count = ();
my %noun = ();
while (my $raw = <$noun_fh>) {
	chomp $raw;
	my @data = split /\t/, decode('UTF-8', $raw);
	# Without a count column the noun would be stored with an undefined
	# count, which breaks the numeric sort and cutoff applied to %count
	# later on, so such lines are skipped.
	next if @data < 3;
	$count{$data[0]} = $data[2];

	# Column 3 is optional: a ", "-separated list of compound nouns, each
	# followed by a parenthesised number such as "(3)" that is stripped.
	next unless $data[3];
	foreach my $entry (split /, /, $data[3]) {
		(my $sub_noun = $entry) =~ s/\(\d+\)$//;
		push @{$noun{$data[0]}}, $sub_noun;
	}
}
close $noun_fh;

# Read the get_noun_list.pl output file (fourth command-line argument),
# tab-separated UTF-8 text, and build
#   %data: series name -> { noun -> number of occurrences }
# Column 0 is looked up in %book (so it is the ISBN), the resulting title is
# looked up in %series, and column 2 is a ", "-separated list of nouns.
defined $ARGV[3] or die "Missing argument 4: get_noun_list.pl output file\n";
# A lexical handle is used instead of the bareword DATA, which is the handle
# Perl itself reserves for the __DATA__/__END__ section of a script.
open my $noun_list_fh, '<', $ARGV[3]
	or die "Cannot open noun list file '$ARGV[3]': $!";
my %data = ();
while (my $raw = <$noun_list_fh>) {
	chomp $raw;
	my ($isbn, undef, $noun_field) = split /\t/, decode('UTF-8', $raw);
	# Nothing to count on lines without a noun list.
	next unless $noun_field;

	# Resolve ISBN -> title -> series name.  When either step fails there is
	# no series to credit; counting under an undefined key would merge all
	# such books into one nameless row of the matrix, so the line is
	# reported and skipped instead.
	my $title = $book{$isbn};
	my $series_name = defined $title ? $series{$title} : undef;
	unless (defined $series_name) {
		warn "Line $. of '$ARGV[3]': no series found for '$isbn'; skipped\n";
		next;
	}

	foreach my $noun (split /, /, $noun_field) {
		++$data{$series_name}{$noun};
	}
}
close $noun_list_fh;

# Print the series x noun matrix as tab-separated UTF-8 text on STDOUT.
#
# Frequency cutoff (fifth command-line argument): nouns whose usage count in
# %count is at or below this value are left out of the matrix.  Defaults to 0
# when the argument is omitted.
my $num = defined $ARGV[4] ? $ARGV[4] : 0;

# Select the column nouns once, most frequent first; nouns with equal counts
# are ordered by name so that the column order is the same on every run.
my @columns = ();
foreach my $noun (sort { $count{$b} <=> $count{$a} or $a cmp $b } keys %count) {
	# The list is in descending order of count, so the first noun at or
	# below the cutoff ends the selection.  Comparing with <= rather than
	# == keeps the cutoff effective when no noun occurs exactly $num times.
	last if $count{$noun} <= $num;
	push @columns, $noun;
}

# Header row: a fixed label for the first column followed by the nouns.
my $midashi = join("\t", '書名', @columns) . "\n";
print encode('UTF-8', $midashi);

# One row per series, in sorted order of series name.
foreach my $series (sort keys %data) {
	my $counts_for = $data{$series};
	my @cells = ($series);
	foreach my $noun (@columns) {
		# A cell is the count of the noun itself plus the counts of all
		# compound nouns registered for it in %noun.
		my $sum = $counts_for->{$noun} || 0;
		foreach my $sub_noun (@{ $noun{$noun} || [] }) {
			$sum += $counts_for->{$sub_noun} || 0;
		}
		push @cells, $sum;
	}
	print encode('UTF-8', join("\t", @cells) . "\n");
}