#!/usr/bin/perl

# Count how many times each noun is used in the output file of get_noun_list.pl and print the counts.
# Usage: perl noun_count_per_book <get_noun_list.pl output file> > <output file>

use strict;
use warnings;
use Encode qw/decode/;
use Encode qw/encode/;
use utf8;

# Input stage: read the file named by the first command-line argument and
# tally how many times each noun occurs into %count (noun => occurrences).
# Only the third tab-separated column of each line is used; it is split on
# ", " into individual nouns. The first two columns (ISBN and sentence, going
# by the names the original code gave them) are ignored.
my $input_path = $ARGV[0];
die "Usage: $0 <get_noun_list.pl output file>\n" unless defined $input_path;

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode or a pipe, and the handle is scoped to this file.
open my $in, '<', $input_path or die "Cannot open '$input_path': $!";
my %count = ();
while (my $raw = <$in>) {
	chomp $raw;
	my $line = decode('UTF-8', $raw);
	my (undef, undef, $noun_list) = split /\t/, $line;
	# A line with fewer than three columns has no noun list; skipping it
	# avoids "uninitialized value" warnings and counts nothing, as before.
	next unless defined $noun_list;
	foreach my $noun (split /, /, $noun_list) {
		++$count{$noun};
	}
}
close $in;

# Return the candidates that contain $noun as a whole word, excluding $noun
# itself, in the order in which they appear in @$candidates.
#   $noun       - the noun to look for; every character is matched literally
#   $candidates - array reference of nouns to search
sub _find_sub_group {
	my ($noun, $candidates) = @_;
	# \Q...\E quotes all regex metacharacters, so nouns such as "C++", "a.b"
	# or "f(x)" match themselves instead of being read as a pattern.
	# The lookarounds mean "not glued to a word character on either side".
	# For nouns that start and end with a word character this is exactly \b,
	# and it also works when the noun starts or ends with punctuation.
	my $pattern = qr/(?<!\w)\Q$noun\E(?!\w)/;
	return grep { $_ ne $noun and $_ =~ $pattern } @{$candidates};
}

# Every noun, most frequent first. Sorted once instead of once per noun;
# ties are broken by name so the sub-word lists are reproducible between runs.
my @nouns_by_count = sort { $count{$b} <=> $count{$a} or $a cmp $b } keys %count;

# Grouping stage: %data maps each finished output line
# ("noun<TAB>own count<TAB>combined count<TAB>sub-word list\n")
# to its combined count, which is the value the output loop sorts on.
my %data = ();
foreach my $noun (@nouns_by_count) {
	my @sub_group = _find_sub_group($noun, \@nouns_by_count);

	# Combined count = the noun's own count plus the counts of all longer
	# nouns that contain it as a whole word.
	my $count2 = $count{$noun};
	$count2 += $count{$_} foreach @sub_group;

	my $sub_word_list = join ', ', map { sprintf '%s(%s)', $_, $count{$_} } @sub_group;
	$data{"$noun\t$count{$noun}\t$count2\t$sub_word_list\n"} = $count2;
}

# Output stage: print the prepared lines as UTF-8, highest combined count
# first. Lines with equal counts are ordered by their text (which begins with
# the noun) so the output is identical from run to run instead of following
# Perl's randomised hash order.
foreach my $line (sort { $data{$b} <=> $data{$a} or $a cmp $b } keys %data) {
	print encode('UTF-8', $line);
}
