#!/usr/bin/env bash mysql_exec(){ mysql -e "$1" reddit || echo "$1" >&2 } NOW_BEGINS_PUSHSHIFT="<<PUSHSHIFT>>" get_cmnts(){ # Get comments from reddit.com, not pushshift if [ -f /tmp/rscrape-submission.json ]; then cat /tmp/rscrape-submission.json | jq '.[1].data | [ walk(if type=="object" and has("replies") and (.replies | type=="object") then .id as $n | .replies.data.children |= map(.data.parent_id = $n) else . end) | .. | select(type=="object" and has("body")) | {id, ups, controversiality, parent_id, author, author_fullname, created, score, body} ] | map(if .parent_id | startswith("t3_") then .parent_id="t3_" else .parent_id="t1_"+.parent_id end)[] | [.id, .author, .author_fullname, .created, .parent_id, .score, .body] | @csv' \ | sed 's/^"\|"$//g' \ | sed 's/\\"/"/g' rm /tmp/rscrape-submission.json echo "Done reddit results" >&2 sleep 10 fi echo "$NOW_BEGINS_PUSHSHIFT" if [ "$PUSHSHIFT_TOO" ]; then cat /tmp/rscrape-submission.cmnts.json \ | jq '.data[] | [ .id, .author, .author_fullname, .created_utc, .parent_id, .score, .body] | @csv' \ | sed 's/^"\|"$//g' \ | sed 's/\\"/"/g' fi } if [ "$COOKIES" = "" ]; then echo "WARNING: No cookies supplied" >&2 fi if [ $# -eq 0 ]; then echo "USAGE" echo " ARGS" echo " [[URLS]]" echo " ENV VARS" echo " PUSHSHIFT_TOO" exit fi declare -i limit=100 for url in "$@"; do sleep 2 url="$(echo "$url" | sed -E 's_^https?://[a-z]+[.]_https://old._g')" # Using JSON format from old.reddit.com #read subreddit_name submission_id_str < <(echo "$url" | sed -E 's_^https://[a-z]+\.reddit\.com/r/([^/]+)/comments/([^/]+)/[^/]+/$_\1 \2_g') echo "Scraping $url" >&2 submission_id_str="$(echo "$url" | sed -E 's_^https?://[a-z]+\.reddit\.com/r/[^/]+/comments/([a-z0-9]+)/.*$_\1_g')" submission_id="$(rscraper-str2id "$submission_id_str")" # "$(mysql -r -s --disable-column-names -e "SELECT id FROM post WHERE id=$submission_id" reddit)" = "" if [ true ]; then echo "Getting submission data from reddit.com" >&2 curl "$url.json" -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Accept-Language: en-GB,en;q=0.5' --compressed -H 'Connection: keep-alive' -H "Cookie: ${COOKIES}" -H 'Upgrade-Insecure-Requests: 1' -H 'DNT: 1' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache' -H 'TE: Trailers' -o /tmp/rscrape-submission.json IFS=',' read submisison_id_str subreddit_name subreddit_id t user_id_str username url title_and_text < <(cat /tmp/rscrape-submission.json \ | jq '.[0].data.children[0].data | [.id, .subreddit, .subreddit_id, .created, .author_fullname, .author, .url, .title, .selftext] | @csv' \ | sed 's/^"\|"$//g' \ | sed 's/\\"/"/g') user_id="$(rscraper-str2id "${user_id_str:4:-1}")" mysql_exec " INSERT INTO user (id,name) VALUES ($user_id,$username) ON DUPLICATE KEY UPDATE id=id " mysql_exec " INSERT INTO post (id,user,t,url,title,text) VALUES ($submission_id,$user_id,$t,$url,$title_and_text) ON DUPLICATE KEY UPDATE id=id " fi before_t="$(date +%s)" while true; do if [ "$PUSHSHIFT_TOO" ]; then echo "Getting results before $before_t" >&2 url="https://api.pushshift.io/reddit/comment/search?link_id=t3_${submission_id_str}&before=${before_t}&filter=id,author_fullname,author,parent_id,created_utc,score,body&size=${limit}" echo "Getting $url" >&2 curl -s "$url" -o /tmp/rscrape-submission.cmnts.json fi declare -i i=0 while IFS=',' read -r id username user_id_str t parent_id score body; do if [ "$id" = "$NOW_BEGINS_PUSHSHIFT" ]; then i=0 continue fi ((++i)) before_t="$t" parent_id="$(rscraper-str2id "${parent_id:4:-1}")" if [ "$parent_id" = "$submission_id" ]; then parent_id=0 fi user_id="$(rscraper-str2id "${user_id_str:4:-1}")" mysql_exec " INSERT INTO user (id,name) VALUES ($user_id,$username) ON DUPLICATE KEY UPDATE id=id " mysql_exec " INSERT INTO cmnt (id,post,parent,user,t,n_likes,content) VALUES ($(rscraper-str2id "${id:1:-1}"), $submission_id, $parent_id, $user_id, $t, $score, $body) AS A ON DUPLICATE KEY UPDATE n_likes=IF((ABS(cmnt.n_likes)>ABS(A.n_likes)),cmnt.n_likes,A.n_likes), content=IF(cmnt.content='[deleted]',A.content,cmnt.content), user=IF(cmnt.user=0,A.user,cmnt.user) " done < <(get_cmnts) echo "Got $i results" >&2 # Retrieved latest first, so t is the earliest timestamp retrieved if [ $i -lt $limit ]; then break fi sleep 10 done done exit # reddit database structure (can be extended) CREATE TABLE user ( id BIGINT UNSIGNED NOT NULL PRIMARY KEY, name VARBINARY(60), UNIQUE KEY (name) ); CREATE TABLE post ( id BIGINT UNSIGNED NOT NULL PRIMARY KEY, user BIGINT UNSIGNED NOT NULL, t BIGINT UNSIGNED NOT NULL DEFAULT 0, url VARBINARY(200), title VARCHAR(300) NOT NULL, text BLOB(40000), FOREIGN KEY (user) REFERENCES user (id) ); CREATE TABLE cmnt ( id BIGINT UNSIGNED NOT NULL PRIMARY KEY, post BIGINT UNSIGNED NOT NULL, parent BIGINT UNSIGNED NOT NULL, user BIGINT UNSIGNED NOT NULL, t BIGINT UNSIGNED NOT NULL DEFAULT 0, content VARCHAR(10000), FOREIGN KEY (post) REFERENCES post (id), -- FOREIGN KEY (parent) REFERENCES cmnt (id), FOREIGN KEY (user) REFERENCES user (id) ); # Command to register all new reddit subbmissions in the tagem database INSERT INTO file2post (db,file,post) SELECT db.id, f.id, p.id FROM file f JOIN dir d ON d.id=f.dir JOIN reddit.post p ON (p.url=CONCAT(d.name, f.name) OR CAST(p.id AS BINARY(20))=SUBSTRING_INDEX(SUBSTRING_INDEX(f.name, '/', 2), '/', -1)) -- Assuming posts are named as files 'comments/<ID>/<TITLE>' JOIN external_db db ON db.name="reddit" ON DUPLICATE KEY UPDATE db=db ;