#!usr/bin/perl BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); } BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } } # This program ranks how similar a text basefile author is to target file authors. Meant for English words. # Can work on 7KB files (1,000 words) if target files are 50KB. Both at 50KB is darn good. # Accuracy, approx: 50% in 1st place given 20 authors in same genre with 50k files. # Use SVMLink open software ranking capability for professional work. # Author: Scott Roberts, 2016. ######## TELL PROGRAM WHAT IT NEEDS TO KNOW ######### $words_to_ignore=''; # for example bitcoin|blockchain|node $basefile='satoshi_all.txt'; # unknown author. Stays in directory with this program $basesize=-s $basefile; # get size of file in bytes $basesize=2000000; # in case you need to make it smaller than above for small target files. $oversize=1; # useful if all target files are a lot bigger than unknown author by some factor >1. $buffer=1.25; # this pulls in more than needed to make sure enough words are obtained $targetfilesdir='books'; # all files > 30% bigger than base file to make sure enough words are retireved. ######## PRINT HTML HEADER ####### print "Content-type: text/html\n\n

Author Comparison

Base text: $basefile $basesize bytes. Target texts directory: $targetfilesdir
words to ignore: $words_to_ignore
using only first $basesize x $oversize bytes of target files

"; ####### RUN PROGRAM ###### open(F,"<$basefile") or die $!; read(F,$c,$basesize); close F; %base_count=get_words($c); # stores count (value) of each word (key). chdir "c:\\_all\\programs\\indigo-perl-new\\apache-2.2.11\\cgi-bin\\$targetfilesdir"; @files=glob('*.txt'); foreach $file (@files) { open(F,"<$file") or die $!; read(F,$c,$basesize*$buffer*$oversize); close F; # 1.3= a buffer %target_count=get_words($c); get_score(); undef %target_count; } ###### FINISHED ----- PRINT RESULTS ########## print "First $total_words words from base text above and target texts below were compared.

"; @ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores; foreach $file (@ranked) { $rank++; print "$rank = " . int($scores{$file}*10/$total_words) . " $file
"; } exit; ######## BEGIN SUBROUTINES ######### sub get_words { $c=$_[0]; if ($words_to_ignore ne '') {$c=~s/$words_to_ignore / /gsi;} $c=~s/\r/\n/gs; $c=~s/[^a-zA-Z ]//gs; # get rid of windows returns and remove all non-alpha and spaces $c=~s/\n//gs; $c=~s/ +/ /gs; # get rid of newlines and excess spaces. @c=split(" ", $c); if ($firsttime eq '') { $total_words=$#c; $firsttime='nope';} else { $#c=$total_words*$oversize; } undef %count; foreach $c (@c) { $count{$c}++;} return %count; } sub get_score { foreach $word (keys %base_count) { $b=$base_count{$word}; if ($target_count{$word} < 1 ){ $t=0.5/$total_words/$oversize; } else { $t =$target_count{$word}/$oversize; } if ($t > $b) { $scores{$file}+=($t/$b)**0.6; } else { $scores{$file}+=($b/$t)**0.6; } }}