加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

不想做诗人的程序员不是一个好爸爸

发布时间:2020-12-15 21:11:20 所属栏目:大数据 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 #!/usr/bin/perluse strict;#use warnings;use encoding 'utf8';use Data::Dumper;sub dump_chars();sub dump_sentenses();sub dump_index();sub reco

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家,仅供参考

#!/usr/bin/perl

use strict;
#use warnings;
use encoding 'utf8';
use Data::Dumper;

sub dump_chars();
sub dump_sentenses();
sub dump_index();
sub recount_chars;
sub mark_chars;
sub dump_result;
sub count_unsed_chars;
sub count_weight;

# 读取字频表
my %chars;
open(CHAR_FILE,'<','lex-chars.lex');
while (my $line = <CHAR_FILE>) {
    chomp($line);
    my @items = split(///,$line);
    print "bad line: $linen" if (scalar(@items) != 3);
    $chars{$items[0]} = $items[2];
}
close(CHAR_FILE);

# 排序,按字频输出
my @sorted_chars = sort { $chars{$b} <=> $chars{$a} } keys %chars;
#dump_chars();

# 筛选出频率前1000个字
my %top_chars;
for (my $i = 0; $i < 2000; $i++) {
    $top_chars{$sorted_chars[$i]} = 1;
}

# 读取名句语料库
my %sentenses;
my %char_sentenses;
my %first_chars;
my %last_chars;
while (my $file = <r01*>) {
    # print "reading $filen";
    open(FILE,$file);
    
    while (my $line = <FILE>) {
        chomp($line);
        $line =~ s/^s+//g;
        $line =~ s/s+$//g;
        next if (!$line);
        $line =~ s/。$//;
        $line =~ s/!$//;
        $line =~ s/?$//;
        $line =~ s/,$//;
        next if ($line =~ /^—/);
        next if (length($line) < 10);
        next if (length($line) > 16);
        next if ($line =~ /^(/);
        next if ($line =~ /《/);
        next if ($line =~ /[0-9A-Za-z]/);
        next if ($line =~ /^,/);
        next if ($line =~ /-/);
        $sentenses{$line}++;
        
        my $first_char = substr($line,1);
        my $last_char = substr($line,length($line) - 1,1);
        $first_chars{$first_char} = 1;
        $last_chars{$last_char} = 1;
    }
    
    close(FILE);
}

# 排序,按句频输出
my @sorted_sentenses = sort { $sentenses{$b} <=> $sentenses{$a} } keys %sentenses;
#dump_sentenses();

# 过滤只出现过一次的句子
#while ($sentenses{$sorted_sentenses[$#sorted_sentenses]} < 2) {
#    pop(@sorted_sentenses);
#}
print 'total sentenses: ' . scalar(@sorted_sentenses) . "n";

# 过滤末字无法接的句子
foreach my $sentense (keys(%sentenses)) {
    my $last_char = substr($sentense,length($sentense) - 1,1);
    if (!$first_chars{$last_char}) {
        #print "delete $sentensen";
        delete($sentenses{$sentense});
    }
}
print 'total sentenses: ' . scalar(%sentenses) . "n";

# 过滤首字无法接的句子
foreach my $sentense (keys(%sentenses)) {
    my $first_char = substr($sentense,1);
    if (!$last_chars{$first_char}) {
        #print "delete $sentensen";
        delete($sentenses{$sentense});
    }
}
print 'total sentenses: ' . scalar(%sentenses) . "n";

foreach my $sentense (keys(%sentenses)) {
    # 建立首字索引
    my $first_char = substr($sentense,1);
    if (!$char_sentenses{$first_char}) {
        $char_sentenses{$first_char} = [];
    }

    push($char_sentenses{$first_char},$sentense);
}
        
# 输出索引信息
#dump_index();

# 统计top字出现在各句子的次数
my %char_in_sentense;
foreach my $char (keys(%top_chars)) {
    my $count = 0;
    foreach my $sentense (@sorted_sentenses) {
        $count++ if (index($sentense,$char) >= 0);
    }
    $char_in_sentense{$char} = $count;
}
my @sorted_char_in_sentenses = sort { $char_in_sentense{$a} <=> $char_in_sentense{$b} } keys %char_in_sentense;
#my $i = 0;
#foreach my $c (@sorted_char_in_sentenses) {
#    $i++;
#    print "$i:t$ct " . $char_in_sentense{$c} . "n";
#}

# 以任意一个句子开头,接龙,首字不重复,直至不能接或首尾相连
my %used_first_chars;
my %used_sentenses;
my %used_chars;
my $char_count = 0;
my @sentense_queue;
my $max_char_count = 0;
my $max_depth = 0;
my $max_loop_depth = 0;
my $lastlog = time();
$| = 1; # disable IO buffer

sub traverse {
    my ($sentense,$depth,$progress) = @_;
    
    # 每分钟显示最新进度
    if (time() - $lastlog > 60) {
        print "sentense: $sentense,depth: $depth,count: $char_count,total progress: $progressn";
        $lastlog = time();
    }
    
    return if ($used_sentenses{$sentense});
    $depth++;
    $used_sentenses{$sentense} = 1; # 标记句子已使用
    mark_chars($sentense);
    push(@sentense_queue,$sentense);
    
    if ($depth != scalar(@sentense_queue)) {
        print "$sentense: $depthn";
        dump_result();
        exit;
    }
    
    if ($depth > $max_depth) {
        $max_depth = $depth;
        print "new depth: $depthn";
        dump_result();
    }
    
    # 找到覆盖1000个字的方案(实际上100句职能覆盖500多个字)
    if ($char_count >= 1000) {
        if ($char_count >= 1000) {
            print "found one: $depthn";
            dump_result();
            exit;
        }
    }
    
    if ($depth >= 100) {
        if ($char_count > $max_char_count) {
            $max_char_count = $char_count;
            print "new record: $max_char_countn";
            dump_result();
        }
        #print "too deep: $char_countn";
        pop(@sentense_queue);
        return;
    }
        
    my $last_char = substr($sentense,1);
    
    #print $last_char;
    # 找到闭环
    if ($last_char eq substr($sentense_queue[0],1)) {
        if ($depth > $max_loop_depth) {
            $max_loop_depth = $depth;
            print "new loop: $depthn";
            dump_result();
        }
        pop(@sentense_queue);
        return;
    }
    
    if ($char_sentenses{$last_char}) {
        my %child_sentenses;
        foreach my $s (@{$char_sentenses{$last_char}}) {
            if (!$used_sentenses{$s}) {
                my $count = count_weight($s);
                if ($count > 0) {
                    $child_sentenses{$s} = $count;
                }
            }
        }
      
        my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses;
        my $i = 0;
        my $len = scalar(@sorted_children);
        foreach my $s (@sorted_children) {
            $i++;
            my $first_char = substr($s,1);
            if (!$used_first_chars{$first_char}) {
                $used_first_chars{$first_char} = 1;
                traverse($s,$progress * ($len - $i + 1));
                delete($used_sentenses{$s});
                delete($used_first_chars{$first_char});
                recount_chars();
            }
        }
        pop(@sentense_queue);
    } else {
        pop(@sentense_queue);
        #print "dead at $sentense: $depth; last char:$last_charn";
    }
}

# 挑选第一个句子
my %child_sentenses;
foreach my $s (keys(%sentenses)) {
    my $count = count_weight($s);
    if ($count > 0) {
        $child_sentenses{$s} = $count;
    }
}
my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses;
my $i = 0;
my $len = scalar(@sorted_children);
foreach my $s (@sorted_children) {
    $i++;
    my $first_char = substr($s,1);
    $used_first_chars{$first_char} = 1;
    traverse($s,$len - $i + 1);
    delete($used_sentenses{$s});
    delete($used_first_chars{$first_char});
    recount_chars();
    print "depth: 0,total progress: " . ($len - $i + 1) .
        ",progress: $i/$len,weight: " . $child_sentenses{$s} . "n";
}

sub dump_chars() {
    my $i = 0;
    foreach my $char (@sorted_chars) {
        $i++;
        print "$it $chart $chars{$char}n";
    }
}

sub dump_sentenses() {
    my $i = 0;
    foreach my $sentense (@sorted_sentenses) {
        $i++;
        print "$it $sentenset $sentenses{$sentense}n";
    }
}

sub dump_index() {
    foreach my $char (keys(%char_sentenses)) {
        print "$char: " . scalar(@{$char_sentenses{$char}}) . "n";
    }
}

sub mark_chars {
    my $sentense = shift;
    for (my $i = 0,my $len = length($sentense); $i < $len; $i++) {
        my $char = substr($sentense,$i,1);
        if ($top_chars{$char} && !$used_chars{$char}) {
          $char_count++;
        }
        $used_chars{$char} = 1;
    }
}

sub count_weight {
    my $sentense = shift;
    my $weight = 0;
    for (my $i = 0,1);
        if ($top_chars{$char} && !$used_chars{$char}) {
            if ($char_in_sentense{$char}) {
                $weight += 1 / $char_in_sentense{$char};
            }
        }
    }
    return $weight;
}

sub count_unsed_chars {
    my $sentense = shift;
    my $count = 0;
    for (my $i = 0,1);
        if ($top_chars{$char} && !$used_chars{$char}) {
          $count++;
        }
    }
    return $count;
}

sub recount_chars {
    # 清空计算器
    undef %used_chars;
    $char_count = 0;
    
    foreach my $sentense (keys(%used_sentenses)) {
        mark_chars($sentense);
    }
} 

sub dump_result {
    print "chars: " . keys(%used_chars) . "n";
    print "char_count: $char_countn";
    my $i = 0;
    foreach my $sentense (@sentense_queue) {
        $i++;
        print "$i: $sentensen";
    }
}

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助,欢迎收藏、点赞、推荐、分享。

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读