@eligor13
Created October 27, 2014 05:36
twpicdl2.sh

#!/bin/sh

# Modified by Stan Schwertly to download locally rather than to send to Posterous.
# GitHub: https://github.com/Stantheman/Twitpic-Backup

# Copyright 2010 Tim "burndive" of http://burndive.blogspot.com/
# This software is licensed under the Creative Commons GNU GPL version 2.0 or later.
# License information: http://creativecommons.org/licenses/GPL/2.0/

# This script is a derivative of the original, obtained from here:
# http://tuxbox.blogspot.com/2010/03/twitpic-to-posterous-export-script.html

# Version 1.2 [add retry]

RUN_DATE=`date +%F--%H-%M-%S`

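# Usage (argument values below are only examples):
#   ./twpicdl2.sh <twitpic_username> <existing_working_dir>
#   e.g. ./twpicdl2.sh someuser /tmp/twitpic-backup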
TP_NAME=$1
WORKING_DIR=$2

IMG_DOWNLOAD=1
PREFIX=twitpic-$TP_NAME
HTML_OUT=$PREFIX-all-$RUN_DATE.html

#CURL_OPT='-f --retry 3 --retry-delay 5 --retry-max-time 60'
CURL_OPT='--retry 3 --retry-delay 5 --retry-max-time 60'
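# Note: curl only returns exit code 22 (checked by the retry loops below)
# when -f/--fail is set, as in the commented-out CURL_OPT variant above.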

# Checks the user-supplied arguments
if [ -z "$TP_NAME" ]; then
  echo "You must supply a TP_NAME."
  exit 1
fi

if [ ! -d "$WORKING_DIR" ]; then
  echo "You must supply an existing WORKING_DIR."
  exit 1
fi

cd "$WORKING_DIR"

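# Layout under WORKING_DIR: saved pages and per-photo HTML go in html/,
# downloaded images in images/, and per-run logs in logs/.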
# Checks for the directories it needs
if [ ! -d "images" ]; then
  mkdir images
fi

if [ ! -d "html" ]; then
  mkdir html
fi

if [ ! -d "logs" ]; then
  mkdir logs
fi

PAGE=0
MAXRETRY=10
RETRY=0

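# Discover how many pages of thumbnails the user has: scrape the profile
# page for a "Last" pagination link; if there is none, fall back to the
# "Next" link; if neither exists, assume a single page.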
LAST=`curl http://twitpic.com/photos/${TP_NAME} \
  | grep "<a href=.*>Last<" \
  | sed "s/.*\?page=\([0-9]*\).*/\1/"`
if [ -z "$LAST" ]; then
  NEXT=`curl http://twitpic.com/photos/${TP_NAME} \
    | grep "<a href=.*>Next<" \
    | sed "s/.*\?page=\([0-9]*\).*/\1/"`
  if [ -z "$NEXT" ]; then
    PAGE=1
  else
    PAGE=$NEXT
  fi
else
  PAGE=$LAST
fi

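# Download each page of thumbnails, counting down from the highest page.
# On an HTTP failure (curl exit 22) the same page is retried up to
# MAXRETRY times before moving on to the next one.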
while [ $PAGE -ne 0 ]; do
  echo "PAGE: $PAGE"
  FILENAME="html/$PREFIX-page-$PAGE.html"
  echo "FILENAME=$FILENAME"
  if [ ! -f "$FILENAME" ]; then
    # wget http://twitpic.com/${TP_NAME}?page=$PAGE -O $FILENAME
    echo "curl http://twitpic.com/${TP_NAME}?page=$PAGE -o $FILENAME"
    curl "http://twitpic.com/${TP_NAME}?page=$PAGE" -o "$FILENAME" $CURL_OPT
    if [ $? -eq 22 -a $RETRY -le $MAXRETRY ]; then
      RETRY=`expr $RETRY + 1`
      sleep 1
    else
      RETRY=0
      PAGE=`expr $PAGE - 1`
    fi
  else
    RETRY=0
    PAGE=`expr $PAGE - 1`
  fi
done

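# Extract the photo IDs: each thumbnail links to /<id>, so pull those
# hrefs out of the saved pages, reduce them to bare IDs, drop the
# "sopapipa" match, and de-duplicate the sorted list.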
ALL_IDS=`cat html/$PREFIX-page-* \
  | grep -Eo "<a href=\"/[a-zA-Z0-9]+\">" \
  | grep -Eo "/[a-zA-Z0-9]+" \
  | grep -Eo "[a-zA-Z0-9]+" \
  | grep -v "sopapipa" \
  | sort -r | uniq | xargs`

COUNT=0
LOG_FILE=logs/$PREFIX-log-$RUN_DATE.txt

echo "$ALL_IDS" | tee -a "$LOG_FILE"

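# For each ID: fetch the photo's detail page (with the same retry
# policy), scrape the full-size image URL from it, then download the
# image itself.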
for ID in $ALL_IDS; do
  COUNT=`expr $COUNT + 1`
  echo "$ID: $COUNT" | tee -a "$LOG_FILE"

  echo "Processing $ID..."
  FULL_HTML="html/$PREFIX-$ID-full.html"
  # wget http://twitpic.com/$ID -O $FULL_HTML
  if [ ! -f "$FULL_HTML" ]; then
    RETRY=$MAXRETRY
    while [ $RETRY -ne 0 ]; do
      echo "curl http://twitpic.com/$ID -o $FULL_HTML"
      curl "http://twitpic.com/$ID" -o "$FULL_HTML" $CURL_OPT
      if [ $? -eq 22 ]; then
        RETRY=`expr $RETRY - 1`
        sleep 1
      else
        RETRY=0
      fi
    done
  fi

  FULL_URL=`grep "<img src" "$FULL_HTML" | grep -Eo "src=\"[^\"]*\"" | grep -Eo "https://[^\"]*"`

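  # Determine the image file extension: first try "name.ext?" ahead of a
  # query string; if the URL has no query string, fall back to whatever
  # follows the last dot.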
    if [ "$IMG_DOWNLOAD" -eq 1 ]; then
    EXT=`echo "$FULL_URL" | grep -Eo "[a-zA-Z0-9]+\.[a-zA-Z0-9]+\?" | head -n1 | grep -Eo "\.[a-zA-Z0-9]+"`
    if [ -z "$EXT" ]; then
    EXT=`echo "$FULL_URL" | grep -Eo "\.[a-zA-Z0-9]+$"`
    fi
    FULL_FILE=$PREFIX-$ID-full$EXT
    # wget "$FULL_URL" -O "images/$FULL_FILE"
    if [ ! -f "images/$FULL_FILE" ]; then
    RETRY=$MAXRETRY
    while [ $RETRY -ne 0 ]; do
    echo "3 " curl "$FULL_URL" -O "images/$FULL_FILE"
    curl "$FULL_URL" -o "images/$FULL_FILE" $CURL_OPT
    if [ $? -eq 22 ]; then
    RETRY=`expr $RETRY - 1`
    sleep 1
    else
    RETRY=0
    fi
    done
    fi
    fi
    done