R
R code
# RImports
library("glue")
library("jsonlite")
library("reticulate")
library("rjson")
download_subreddit <- '
from pmaw import PushshiftAPI
import json
import os

def download_subreddit(subreddit, tmp_dir):
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    output_file = \'%s/data.json\' % tmp_dir
    # Only download if the file does not already exist or is empty.
    if not os.path.exists(output_file) or os.stat(output_file).st_size == 0:
        api = PushshiftAPI(num_workers=16, limit_type=\'backoff\', jitter=\'decorr\')
        posts = api.search_submissions(subreddit=subreddit, limit=10000000000.0, after=0)
        # Write one JSON object per line (newline-delimited JSON).
        with open(output_file, \'w\') as f:
            for post in posts:
                line = json.dumps(post) + chr(10)
                f.write(line)
'
read.json.for.table <- function(static_data_table_name, dataset_name, columns) {
  columns <- fromJSON(columns)
  file.name <- glue("/tmp/{dataset_name}/{static_data_table_name}/data.json")
  # stream_in() reads newline-delimited JSON into an R data frame.
  data <- stream_in(file(file.name))
  # Keep only the requested columns, matching the Python variants below.
  data <- data[, columns, drop = FALSE]
  message(sprintf("Downloaded dataframe with %d rows and %d columns.",
                  nrow(data), ncol(data)))
  return(data)
}
# ## Downloading data from the Pushshift API
# The data for this asset lives in the Pushshift API. We need to
# download it to a local directory before we can do anything with it.
#
download_json <- py_run_string(paste(
  download_subreddit,
  "download_subreddit(tmp_dir='/tmp/local_subreddits/frankfurt', subreddit='frankfurt')",
  sep = "\n"
))
# ## R data frame from newline-delimited JSON file
# We convert an intermediate newline-delimited JSON file to an R data frame.
r_data <- read.json.for.table(
  static_data_table_name = "frankfurt",
  dataset_name = "local_subreddits",
  columns = '["id","author","subreddit","created_utc","title","selftext"]'
)
Pandas
requirements.txt
pandas==1.2.3
pmaw==1.0.4
pip command
pip3 install pandas==1.2.3 pmaw==1.0.4
Pandas code
# PythonImports
import json
import os

import pandas as pd
from pmaw import PushshiftAPI


def download_subreddit(subreddit, tmp_dir):
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    output_file = "%s/data.json" % tmp_dir
    # Only download if the file does not already exist or is empty.
    if not os.path.exists(output_file) or os.stat(output_file).st_size == 0:
        api = PushshiftAPI(
            num_workers=16, limit_type="backoff", jitter="decorr"
        )
        posts = api.search_submissions(
            subreddit=subreddit, limit=10000000000.0, after=0
        )
        # Write one JSON object per line (newline-delimited JSON).
        with open(output_file, "w") as f:
            for post in posts:
                line = json.dumps(post) + "\n"
                f.write(line)


def read_json_into_pandas(static_data_table_name, dataset_name, columns):
    columns = json.loads(columns)
    with open(
        "/tmp/{dataset_name}/{static_data_table_name}/data.json".format(
            dataset_name=dataset_name,
            static_data_table_name=static_data_table_name,
        )
    ) as f:
        raw_records = [json.loads(line) for line in f]
    # Keep only the requested columns, in the requested order.
    records = [[r[k] for k in columns] for r in raw_records]
    return pd.DataFrame(records, columns=columns)
# ## Downloading data from the Pushshift API
# The data for this asset lives in the Pushshift API. We need to
# download it to a local directory before we can do anything with it.
#
download_json = download_subreddit(
    tmp_dir="/tmp/local_subreddits/frankfurt", subreddit="frankfurt"
)
# ## Pandas from newline-delimited JSON file
# We convert an intermediate newline-delimited JSON file to a Pandas
# dataframe.
pandas_data = read_json_into_pandas(
    columns='["id","author","subreddit","created_utc","title","selftext"]',
    dataset_name="local_subreddits",
    static_data_table_name="frankfurt",
)
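As a quick sanity check (a sketch, not part of the generated listing), the epoch-second created_utc column can be parsed into real timestamps, assuming pandas_data was built by the call above:

# Convert epoch seconds into timezone-aware timestamps.
pandas_data["created_utc"] = pd.to_datetime(
    pandas_data["created_utc"], unit="s", utc=True
)
# Peek at the five most recent submissions.
print(pandas_data.sort_values("created_utc")[["created_utc", "title"]].tail())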
Numpy
requirements.txt
numpy==1.20.1
pmaw==1.0.4
pip command
pip3 install numpy==1.20.1 pmaw==1.0.4
Numpy code
# PythonImports
import json
import os

import numpy as np
from pmaw import PushshiftAPI


def download_subreddit(subreddit, tmp_dir):
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    output_file = "%s/data.json" % tmp_dir
    # Only download if the file does not already exist or is empty.
    if not os.path.exists(output_file) or os.stat(output_file).st_size == 0:
        api = PushshiftAPI(
            num_workers=16, limit_type="backoff", jitter="decorr"
        )
        posts = api.search_submissions(
            subreddit=subreddit, limit=10000000000.0, after=0
        )
        # Write one JSON object per line (newline-delimited JSON).
        with open(output_file, "w") as f:
            for post in posts:
                line = json.dumps(post) + "\n"
                f.write(line)


def read_json_into_numpy(static_data_table_name, dataset_name, columns):
    columns = json.loads(columns)
    with open(
        "/tmp/{dataset_name}/{static_data_table_name}/data.json".format(
            dataset_name=dataset_name,
            static_data_table_name=static_data_table_name,
        )
    ) as f:
        raw_records = [json.loads(line) for line in f]
    # Structured arrays must be built from tuples, not lists.
    records = [tuple(str(r[k]) for k in columns) for r in raw_records]
    # Size each unicode field to the longest string in its column.
    max_str_lens = [max(len(y) for y in x) for x in zip(*records)]
    ndarray = np.array(
        records,
        dtype=[(name, "U%d" % n) for n, name in zip(max_str_lens, columns)],
    )
    return ndarray
# ## Downloading data from the Pushshift API
# The data for this asset lives in the Pushshift API. We need to
# download it to a local directory before we can do anything with it.
#
download_json = download_subreddit(
    tmp_dir="/tmp/local_subreddits/frankfurt", subreddit="frankfurt"
)
# ## Numpy from newline-delimited JSON file
# We convert an intermediate newline-delimited JSON file to
# a Numpy ndarray.
numpy_data = read_json_into_numpy(
    dataset_name="local_subreddits",
    columns='["id","author","subreddit","created_utc","title","selftext"]',
    static_data_table_name="frankfurt",
)
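The structured ndarray can be indexed by field name, one field per schema column; a minimal sketch, assuming numpy_data was built by the call above:

# The dtype carries one named field per requested column.
print(numpy_data.dtype.names)  # ('id', 'author', 'subreddit', ...)
# Indexing by field name returns a plain 1-D array of that column.
titles = numpy_data["title"]
print(titles[:5])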
Schema
| Name | Type |
|---|---|
| id | KeyStringIdentifier |
| author | StringIdentifier |
| subreddit | StringIdentifier |
| created_utc | POSIXTimestamp |
| title | FreeText |
| selftext | FreeText |
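The schema maps each key of the downloaded NDJSON records to a logical type. A minimal validation sketch (the SCHEMA dict and the checks are illustrative, not part of the generated code):

import json

# Logical types copied from the schema table above.
SCHEMA = {
    "id": "KeyStringIdentifier",
    "author": "StringIdentifier",
    "subreddit": "StringIdentifier",
    "created_utc": "POSIXTimestamp",
    "title": "FreeText",
    "selftext": "FreeText",
}

with open("/tmp/local_subreddits/frankfurt/data.json") as f:
    first = json.loads(next(f))

# Every schema column should be present in each record, and the
# POSIXTimestamp field should hold numeric epoch seconds.
assert set(SCHEMA) <= set(first)
assert isinstance(first["created_utc"], (int, float))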
PushshiftAPILocation Storage
| Param | Value |
|---|---|
| subreddit | frankfurt |
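These storage parameters are what instantiate the pipeline above; swapping them out retargets it. A minimal sketch (the second subreddit is hypothetical):

# Only the storage parameter changes between pipelines.
download_subreddit(subreddit="berlin", tmp_dir="/tmp/local_subreddits/berlin")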