#!/usr/bin/env bash
# Execute one SQL statement against the `reddit` database.
# $1 - the SQL statement (a full string, interpolated by callers).
# On failure, print a short diagnostic and then dump the raw statement to
# stderr so it can be replayed by hand later.  Always returns 0 so a single
# failed statement does not abort the scrape.
mysql_exec(){
	if ! mysql -e "$1" reddit; then
		echo "mysql_exec: statement failed; dumping SQL to stderr" >&2
		echo "$1" >&2
	fi
}
NOW_BEGINS_PUSHSHIFT="<<PUSHSHIFT>>"
# Print one CSV row per comment to stdout, fields in this order:
#   id, author, author_fullname, created, parent_id, score, body
# Section 1: comments from the reddit.com listing previously downloaded to
# /tmp/rscrape-submission.json by the caller; the file is deleted after use,
# so this section only appears once per submission.
# Section 2 (only when PUSHSHIFT_TOO is set): comments from the pushshift
# page previously downloaded to /tmp/rscrape-submission.cmnts.json.
# The sentinel $NOW_BEGINS_PUSHSHIFT is printed between the two sections.
# jq is run WITHOUT -r, so each @csv row is emitted wrapped in an outer JSON
# string with inner quotes escaped as \"; the first sed strips that outer
# quote pair and the second unescapes the inner quotes, leaving each field
# still wrapped in its own '"' (callers rely on this for their
# ${var:N:-1} slicing and for SQL string quoting).
get_cmnts(){
	# Get comments from reddit.com, not pushshift
	if [ -f /tmp/rscrape-submission.json ]; then
		# The jq program:
		#  - walk() stamps every nested reply with its parent's id (the
		#    listing nests replies, so parent_id is implicit in the tree);
		#  - any object with a "body" key is a comment; extract wanted keys;
		#  - comment parents get re-prefixed "t1_"+id, while submission
		#    ("t3_...") parents are collapsed to the bare string "t3_".
		#    NOTE(review): that drops the submission id, so the caller's
		#    ${parent_id:4:-1} yields "" for top-level comments — presumably
		#    rscraper-str2id tolerates "" — confirm.
		cat /tmp/rscrape-submission.json | jq '.[1].data |
		[ walk(if type=="object" and has("replies") and (.replies | type=="object")
		then .id as $n | .replies.data.children |= map(.data.parent_id = $n)
		else . end)
		| ..
		| select(type=="object" and has("body"))
		| {id, ups, controversiality, parent_id, author, author_fullname, created, score, body}
		] | map(if .parent_id | startswith("t3_") then .parent_id="t3_" else .parent_id="t1_"+.parent_id end)[] | [.id, .author, .author_fullname, .created, .parent_id, .score, .body] | @csv' \
		| sed 's/^"\|"$//g' \
		| sed 's/\\"/"/g'
		rm /tmp/rscrape-submission.json
		echo "Done reddit results" >&2
		# Rate-limit courtesy pause before the caller's next request.
		sleep 10
	fi
	# Everything after this line comes from pushshift.
	echo "$NOW_BEGINS_PUSHSHIFT"
	if [ "$PUSHSHIFT_TOO" ]; then
		# NOTE(review): assumes the caller already downloaded this file; if
		# it is missing, cat fails to stderr and the section is empty.
		cat /tmp/rscrape-submission.cmnts.json \
		| jq '.data[] | [ .id, .author, .author_fullname, .created_utc, .parent_id, .score, .body] | @csv' \
		| sed 's/^"\|"$//g' \
		| sed 's/\\"/"/g'
	fi
}
# Warn (but continue) when no session cookies were supplied via the COOKIES
# environment variable; reddit.com may then serve a limited view.
if [ -z "$COOKIES" ]; then
	echo "WARNING: No cookies supplied" >&2
fi
# Nothing to scrape without URL arguments: show usage and stop.
if [ $# -eq 0 ]; then
	printf '%s\n' \
		"USAGE" \
		"  ARGS" \
		"    [[URLS]]" \
		"  ENV VARS" \
		"    PUSHSHIFT_TOO"
	exit
fi
# Page size for pushshift requests; also the stop condition for paging —
# a page yielding fewer than $limit pushshift rows is treated as the last.
declare -i limit=100
for url in "$@"; do
	sleep 2
	# Rewrite whatever reddit domain was supplied to old.reddit.com, whose
	# .json endpoint serves the listing format the jq programs expect.
	url="$(echo "$url" | sed -E 's_^https?://[a-z]+[.]_https://old._g')" # Using JSON format from old.reddit.com
	#read subreddit_name submission_id_str < <(echo "$url" | sed -E 's_^https://[a-z]+\.reddit\.com/r/([^/]+)/comments/([^/]+)/[^/]+/$_\1 \2_g')
	echo "Scraping $url" >&2
	# Base-36 submission id taken from the /comments/<id>/ path segment.
	submission_id_str="$(echo "$url" | sed -E 's_^https?://[a-z]+\.reddit\.com/r/[^/]+/comments/([a-z0-9]+)/.*$_\1_g')"
	submission_id="$(rscraper-str2id "$submission_id_str")"
	# "$(mysql -r -s --disable-column-names -e "SELECT id FROM post WHERE id=$submission_id" reddit)" = ""
	# Always true: "[ true ]" only tests that the literal string "true" is
	# non-empty.  The real already-scraped check is the commented-out mysql
	# query above.
	if [ true ]; then
		echo "Getting submission data from reddit.com" >&2
		curl "$url.json" -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Accept-Language: en-GB,en;q=0.5' --compressed -H 'Connection: keep-alive' -H "Cookie: ${COOKIES}" -H 'Upgrade-Insecure-Requests: 1' -H 'DNT: 1' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache' -H 'TE: Trailers' -o /tmp/rscrape-submission.json
		# Split the submission's CSV row (fields still '"'-quoted; see
		# get_cmnts for the jq/sed quoting scheme).
		# NOTE(review): the first variable is misspelled ("submisison...")
		# but is accidentally load-bearing — the CSV value keeps its '"'
		# wrapper, and the clean $submission_id_str from the URL (reused in
		# the pushshift URL below) must not be clobbered by it.  $url IS
		# clobbered here, but is reassigned before its next use.
		# title_and_text swallows the two remaining CSV fields
		# '"title","text"', comma included, so it expands to the final two
		# SQL values in the INSERT below.
		IFS=',' read submisison_id_str subreddit_name subreddit_id t user_id_str username url title_and_text < <(cat /tmp/rscrape-submission.json \
		| jq '.[0].data.children[0].data | [.id, .subreddit, .subreddit_id, .created, .author_fullname, .author, .url, .title, .selftext] | @csv' \
		| sed 's/^"\|"$//g' \
		| sed 's/\\"/"/g')
		# Strip the leading '"t2_' and trailing '"' from the user fullname.
		user_id="$(rscraper-str2id "${user_id_str:4:-1}")"
		# NOTE(review): values are interpolated into SQL relying on the
		# surviving CSV double-quotes to act as SQL string quoting —
		# fragile and injection-prone for hostile titles/bodies.
		mysql_exec "
		INSERT INTO user
		(id,name)
		VALUES
		($user_id,$username)
		ON DUPLICATE KEY UPDATE id=id
		"
		mysql_exec "
		INSERT INTO post
		(id,user,t,url,title,text)
		VALUES
		($submission_id,$user_id,$t,$url,$title_and_text)
		ON DUPLICATE KEY UPDATE id=id
		"
	fi
	# Pushshift pages backwards in time: before_t starts at "now" and is
	# moved to the oldest comment timestamp seen after each page.
	before_t="$(date +%s)"
	while true; do
		if [ "$PUSHSHIFT_TOO" ]; then
			echo "Getting results before $before_t" >&2
			url="https://api.pushshift.io/reddit/comment/search?link_id=t3_${submission_id_str}&before=${before_t}&filter=id,author_fullname,author,parent_id,created_utc,score,body&size=${limit}"
			echo "Getting $url" >&2
			curl -s "$url" -o /tmp/rscrape-submission.cmnts.json
		fi
		# i counts rows on the current page; the sentinel resets it so only
		# pushshift rows count toward the pagination decision below.
		declare -i i=0
		# body is the last read field, so embedded commas survive;
		# NOTE(review): embedded newlines in bodies would not — confirm the
		# jq @csv output is one physical line per comment.
		while IFS=',' read -r id username user_id_str t parent_id score body; do
			if [ "$id" = "$NOW_BEGINS_PUSHSHIFT" ]; then
				i=0
				continue
			fi
			((++i))
			before_t="$t"
			# parent_id arrives as '"t1_xyz"' (or bare '"t3_"'): drop the
			# 4-char prefix and the trailing quote, then convert to numeric.
			parent_id="$(rscraper-str2id "${parent_id:4:-1}")"
			# Top-level comments (parent == the submission) store parent=0.
			if [ "$parent_id" = "$submission_id" ]; then
				parent_id=0
			fi
			user_id="$(rscraper-str2id "${user_id_str:4:-1}")"
			mysql_exec "
			INSERT INTO user
			(id,name)
			VALUES
			($user_id,$username)
			ON DUPLICATE KEY UPDATE id=id
			"
			# Upsert the comment.  The 'AS A' row alias requires MySQL
			# 8.0.19+.  On duplicate: keep the score with the larger
			# absolute value, prefer non-'[deleted]' content, and fill in
			# user when it was previously 0.
			mysql_exec "
			INSERT INTO cmnt
			(id,post,parent,user,t,n_likes,content)
			VALUES
			($(rscraper-str2id "${id:1:-1}"), $submission_id, $parent_id, $user_id, $t, $score, $body)
			AS A
			ON DUPLICATE KEY UPDATE
			n_likes=IF((ABS(cmnt.n_likes)>ABS(A.n_likes)),cmnt.n_likes,A.n_likes),
			content=IF(cmnt.content='[deleted]',A.content,cmnt.content),
			user=IF(cmnt.user=0,A.user,cmnt.user)
			"
		done < <(get_cmnts)
		echo "Got $i results" >&2
		# Retrieved latest first, so t is the earliest timestamp retrieved
		if [ $i -lt $limit ]; then
			break
		fi
		sleep 10
	done
done
# Everything below this exit is reference notes (SQL schema and a tagem
# registration query); it is never executed by the script.
exit
# reddit database structure (can be extended)
CREATE TABLE user (
id BIGINT UNSIGNED NOT NULL PRIMARY KEY,
name VARBINARY(60),
UNIQUE KEY (name)
);
CREATE TABLE post (
id BIGINT UNSIGNED NOT NULL PRIMARY KEY,
user BIGINT UNSIGNED NOT NULL,
t BIGINT UNSIGNED NOT NULL DEFAULT 0,
url VARBINARY(200),
title VARCHAR(300) NOT NULL,
text BLOB(40000),
FOREIGN KEY (user) REFERENCES user (id)
);
CREATE TABLE cmnt (
id BIGINT UNSIGNED NOT NULL PRIMARY KEY,
post BIGINT UNSIGNED NOT NULL,
parent BIGINT UNSIGNED NOT NULL,
user BIGINT UNSIGNED NOT NULL,
	t BIGINT UNSIGNED NOT NULL DEFAULT 0,
	n_likes BIGINT NOT NULL DEFAULT 0, -- NOTE: written by the script's INSERT INTO cmnt; this column was missing from these notes
	content VARCHAR(10000),
FOREIGN KEY (post) REFERENCES post (id),
-- FOREIGN KEY (parent) REFERENCES cmnt (id),
FOREIGN KEY (user) REFERENCES user (id)
);
# Command to register all new reddit submissions in the tagem database
INSERT INTO file2post (db,file,post)
SELECT db.id, f.id, p.id
FROM file f
JOIN dir d ON d.id=f.dir
JOIN reddit.post p ON (p.url=CONCAT(d.name, f.name) OR CAST(p.id AS BINARY(20))=SUBSTRING_INDEX(SUBSTRING_INDEX(f.name, '/', 2), '/', -1)) -- Assuming posts are named as files 'comments/<ID>/<TITLE>'
JOIN external_db db ON db.name="reddit"
ON DUPLICATE KEY UPDATE db=db
;