Skip to content

Instantly share code, notes, and snippets.

@userid
Last active April 20, 2020 16:30
Show Gist options
  • Select an option

  • Save userid/fecee8535dcebf3197b7347c123a653c to your computer and use it in GitHub Desktop.

Select an option

Save userid/fecee8535dcebf3197b7347c123a653c to your computer and use it in GitHub Desktop.

Revisions

  1. userid revised this gist Apr 20, 2020. 1 changed file with 90 additions and 0 deletions.
    90 changes: 90 additions & 0 deletions crontab_download_generate.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,90 @@
    #!/usr/bin/perl -w
    ##
    ##
    ##

    use 5.012;
    $| = 1;
    use POSIX qw(strftime);

    chdir '/home/odps/ods_ac_serverconfig/';

    my $date = strftime("%Y%m%d", localtime(time()-86400));
    my $sql = "tunnel download cbods.ods_ac_serverconfig/requestdate='$date' -fd '|' data/serverconfig/serverconfig.$date.txt;";

    say $sql;
    say qx{ /opt/odps/bin/odpscmd -e "$sql"};
    my $downloadfile = "data/serverconfig/serverconfig.$date.txt";
    die("file download error") unless -s $downloadfile;

    ###_---- 开始解析

    open my $f_user, "+>", "huge/user.csv";
    open my $f_device, "+>", "huge/device.csv";
    open my $f_udevice, "+>", "huge/udevice.csv";
    open my $f_used, "+>", "huge/used.csv";
    open my $f_contains, "+>", "huge/contains.csv";

    sub get_uuid_time($){
    use POSIX qw(strftime);
    my $uuid= shift || 'ef802820-46b3-11e2-bf3a-47ef6b3e28e2';
    $uuid =~ s/-//g;

    my $timelow = hex substr( $uuid, 2 * 0, 2 * 4 );
    my $timemid = hex substr( $uuid, 2 * 4, 2 * 2 );
    my $version = hex substr( $uuid, 2 * 6, 1 );
    my $timehi = hex substr( $uuid, 2 * 6 + 1, 2 * 2 - 1 );

    my $time = ( $timehi * ( 2**16 ) + $timemid ) * ( 2**32 ) + $timelow;
    my $epoc = int( $time / 10000000 ) - 12219292800;

    #return strftime( "%Y%m%d", localtime($epoc) ) ;
    return strftime( "%Y-%m-%d %H:%M:%S", localtime($epoc) );
    }
    sub process($){

    my $l = shift;
    chomp $l;
    my @vals = split '\\|', $l ;
    return if scalar @vals <5;
    my $userid = $vals[0];
    my $type = $vals[1];
    my $model = $vals[2];
    $model=~s/,/_/;
    my $mac = $vals[3];
    my $uuid = $vals[-1] || '';

    return if $userid eq '';
    return if $userid == 0;
    my $date = get_uuid_time($uuid);

    #say "$userid $model $mac $uuid $date";
    print $f_user "$userid\n";
    print $f_device "$mac,$model,$type\n";
    print $f_udevice "$uuid,$date,$type\n";
    print $f_used "$userid,$uuid\n";
    print $f_contains "$uuid,$mac\n";


    }

    my $n = 0;
    open my $fd,"<", $downloadfile;
    while(<$fd>){

    process($_) if ( $_ ne '');
    say "processed $n lines." if $n++ % 1000 == 0;

    }
    close($fd);
    close($f_user);
    close $f_device;
    close $f_udevice;
    close $f_user;
    close $f_contains;

    die("解析文件出错" ) unless -s "huge/user.csv";

    #__----- 开始导入hugegraph

    say qx {/home/dev/hugegraph-loader-0.7.0/bin/hugegraph-loader -g hugegraph -f huge/s.json -s huge/schema.groovy -h 10.10.14.33 --max-parse-errors 10000 --max-insert-errors 10000};
  2. userid created this gist Apr 20, 2020.
    37 changes: 37 additions & 0 deletions odps_to_clickhouse_download.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,37 @@
    #!/usr/bin/perl -w
    ##
    ##
    ##

    use 5.012;
    $| = 1;
    use POSIX qw(strftime);

    chdir '/home/odps/';
    my $date = strftime("%Y%m%d", localtime(time()-86400*1));
    my $partition = strftime("%Y-%m-%d", localtime(time()-86400*1));
    my $sql = "tunnel download cbmds.mds_user_attribute_original_big_data_table/requestdate=${date} -fd '\x01' ./data/user_attribute_${date}.txt ";
    say $sql;

    say "=== 开始下载odps ===";

    say qx{ /opt/odps/bin/odpscmd -e "$sql"};
    my $downloadfile = "./data/user_attribute_${date}.txt";
    die("file download error") unless -s $downloadfile;

    #__----- 开始导入
    #
    say "=== 开始导入clickhouse ===";

    say qx{ cat $downloadfile |sed -e "s/['\\"{}\\r\\t]//g" -e 's/\$/\\x01${partition}/' | clickhouse-client --format_csv_delimiter='\x01' --query="INSERT INTO user_attribute FORMAT CSV" --input_format_allow_errors_num=10000000};


    #__----- 开始清理旧数据
    my $old_date = strftime("%Y%m%d", localtime(time()-86400*9));
    my $old_partition = strftime("%Y-%m-%d", localtime(time()-86400*9));
    say "=== drop table partition '$old_partition' ===";
    qx{rm ./data/user_attribute_${old_date}.txt };
    say qx{ clickhouse-client --query=" alter table user_attribute drop partition '$old_partition' "};


    say "=== Done ===\n,", `date`;