Code to download data. Choose a dataset, then a table, and get code to download it as an R data frame, a Pandas dataframe, or a Numpy ndarray.

hub > local_subreddits > vzla

R

R code Copy

# RImports
library("glue")
library("jsonlite")
library("reticulate")
library("rjson")

# Python source, held in an R string, that defines download_subreddit().
# Nothing is executed by this assignment; further down the string is joined
# with a call to the function and handed to py_run_string().
#
# The embedded Python function:
#   * creates tmp_dir if it does not exist;
#   * skips the download when <tmp_dir>/data.json already exists and is
#     non-empty;
#   * otherwise queries PushshiftAPI.search_submissions() for the subreddit
#     and writes each returned post as one JSON object per line.
# NOTE(review): chr(10) is a newline; presumably spelled that way to avoid a
# backslash escape inside this R string literal -- confirm before changing.
download_subreddit <- '
from pmaw import PushshiftAPI
import json
import os
def download_subreddit(subreddit, tmp_dir):
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    output_file = \'%s/data.json\' % tmp_dir
    if not os.path.exists(output_file) or os.stat(output_file).st_size == 0:
        api = PushshiftAPI(num_workers=16, limit_type=\'backoff\', jitter=
            \'decorr\')
        posts = api.search_submissions(subreddit=subreddit, limit=
            10000000000.0, after=0)
        with open(output_file, \'w\') as f:
            for post in posts:
                line = json.dumps(post) + chr(10)
                f.write(line)
'
#' Load a downloaded newline-delimited JSON table into a data frame.
#'
#' Reads `/tmp/<dataset_name>/<static_data_table_name>/data.json` with
#' `stream_in()` and reports the resulting dimensions.
#'
#' @param static_data_table_name Table name; the last directory component of
#'   the file path.
#' @param dataset_name Dataset name; the directory directly under `/tmp`.
#' @param columns JSON-encoded array of column names. It is parsed (so
#'   malformed JSON still fails early) but is not used to subset the result:
#'   every field found in the file is returned.
#' @return The data frame produced by `stream_in()`.
read.json.for.table <- function(static_data_table_name, dataset_name,
    columns) {
    columns <- fromJSON(columns)
    file.name <- glue("/tmp/{dataset_name}/{static_data_table_name}/data.json",
        dataset_name = dataset_name, static_data_table_name = static_data_table_name)
    data <- stream_in(file(file.name))
    # sprintf() only builds the string; inside a function its value is
    # silently dropped, so the status line has to be emitted explicitly.
    message(sprintf("Downloaded dataframe with %d rows and %d columns.",
        nrow(data), ncol(data)))
    data
}
# ## Downloading data from the Pushshift API
# The data for this asset is served by the Pushshift API, so it has to be
# fetched into a local directory before anything else can be done with it.
# The Python definition and the call that runs it are joined by a newline
# and executed as one Python program.
download_json <- py_run_string(
    paste0(
        download_subreddit,
        "\n",
        "download_subreddit(tmp_dir='/tmp/local_subreddits/vzla', subreddit='vzla')\n"
    )
)

# ## R data frame from newline-delimited JSON file
# The intermediary newline-delimited JSON file is loaded into an R data frame.
r_data <- read.json.for.table(
    columns = "[\"id\",\"author\",\"subreddit\",\"created_utc\",\"title\",\"selftext\"]",
    dataset_name = "local_subreddits",
    static_data_table_name = "vzla"
)

Pandas

requirements.txt Copy

- pandas==1.2.3
- pmaw==1.0.4
pip command Copy

pip3 install pandas==1.2.3 pmaw==1.0.4
Pandas code Copy

# PythonImports
import json
import os
import pandas as pd
from pmaw import PushshiftAPI


def download_subreddit(subreddit, tmp_dir):
    """Download a subreddit's submissions to ``<tmp_dir>/data.json``.

    The output is newline-delimited JSON: one submission per line. If the
    output file already exists and is non-empty, nothing is downloaded.

    The data is first written to ``<tmp_dir>/data.json.partial`` and only
    renamed to its final name once every post has been written. A run that
    fails part-way therefore never leaves a truncated ``data.json`` behind
    that a later call would mistake for a finished download.

    Args:
        subreddit: Subreddit name passed to ``search_submissions``.
        tmp_dir: Directory that receives ``data.json``; created if missing.

    Returns:
        None.

    Raises:
        Whatever the API client, JSON encoding, or file I/O raises; the
        partial file is removed before the exception propagates.
    """
    os.makedirs(tmp_dir, exist_ok=True)
    output_file = "%s/data.json" % tmp_dir
    if os.path.exists(output_file) and os.stat(output_file).st_size > 0:
        return  # already downloaded

    api = PushshiftAPI(num_workers=16, limit_type="backoff", jitter="decorr")
    # NOTE(review): the huge limit presumably means "fetch everything" --
    # confirm against the pmaw documentation.
    posts = api.search_submissions(
        subreddit=subreddit, limit=10000000000.0, after=0
    )

    partial_file = output_file + ".partial"
    try:
        with open(partial_file, "w") as f:
            for post in posts:
                f.write(json.dumps(post) + "\n")
        # Atomic on the same filesystem: readers see either no file or the
        # complete one.
        os.replace(partial_file, output_file)
    except BaseException:
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise


def read_json_into_pandas(static_data_table_name, dataset_name, columns):
    """Load selected fields of a downloaded table into a pandas DataFrame.

    Reads ``/tmp/<dataset_name>/<static_data_table_name>/data.json``, which
    is expected to hold one JSON object per line.

    Args:
        static_data_table_name: Table name; last directory of the path.
        dataset_name: Dataset name; directory directly under ``/tmp``.
        columns: JSON-encoded array of field names to keep, in order.

    Returns:
        A DataFrame with exactly ``columns`` as its columns and one row per
        non-blank line. A field that a record does not contain becomes
        ``None`` instead of aborting the whole load.
    """
    columns = json.loads(columns)
    path = "/tmp/{dataset_name}/{static_data_table_name}/data.json".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
    )
    records = []
    with open(path) as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            post = json.loads(line)
            records.append([post.get(k) for k in columns])
    return pd.DataFrame(records, columns=columns)


# ## Downloading data from the Pushshift API
# The data for this asset is served by the Pushshift API, so it is fetched
# into a local directory first; everything below reads from that copy.
download_json = download_subreddit(
    subreddit="vzla", tmp_dir="/tmp/local_subreddits/vzla"
)

# ## Pandas from newline-delimited JSON file
# Load the intermediary newline-delimited JSON file into a Pandas dataframe,
# keeping only the schema columns.
pandas_data = read_json_into_pandas(
    static_data_table_name="vzla",
    dataset_name="local_subreddits",
    columns='["id","author","subreddit","created_utc","title","selftext"]',
)

Numpy

requirements.txt Copy

numpy==1.20.1
pmaw==1.0.4
pip command Copy

pip3 install numpy==1.20.1 pmaw==1.0.4
Numpy code Copy

# PythonImports
import json
import numpy as np
import os
from pmaw import PushshiftAPI


def download_subreddit(subreddit, tmp_dir):
    """Download a subreddit's submissions to ``<tmp_dir>/data.json``.

    The output is newline-delimited JSON: one submission per line. If the
    output file already exists and is non-empty, nothing is downloaded.

    The data is first written to ``<tmp_dir>/data.json.partial`` and only
    renamed to its final name once every post has been written. A run that
    fails part-way therefore never leaves a truncated ``data.json`` behind
    that a later call would mistake for a finished download.

    Args:
        subreddit: Subreddit name passed to ``search_submissions``.
        tmp_dir: Directory that receives ``data.json``; created if missing.

    Returns:
        None.

    Raises:
        Whatever the API client, JSON encoding, or file I/O raises; the
        partial file is removed before the exception propagates.
    """
    os.makedirs(tmp_dir, exist_ok=True)
    output_file = "%s/data.json" % tmp_dir
    if os.path.exists(output_file) and os.stat(output_file).st_size > 0:
        return  # already downloaded

    api = PushshiftAPI(num_workers=16, limit_type="backoff", jitter="decorr")
    # NOTE(review): the huge limit presumably means "fetch everything" --
    # confirm against the pmaw documentation.
    posts = api.search_submissions(
        subreddit=subreddit, limit=10000000000.0, after=0
    )

    partial_file = output_file + ".partial"
    try:
        with open(partial_file, "w") as f:
            for post in posts:
                f.write(json.dumps(post) + "\n")
        # Atomic on the same filesystem: readers see either no file or the
        # complete one.
        os.replace(partial_file, output_file)
    except BaseException:
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise


def read_json_into_numpy(static_data_table_name, dataset_name, columns):
    """Load selected fields of a downloaded table into a structured ndarray.

    Reads ``/tmp/<dataset_name>/<static_data_table_name>/data.json``, which
    is expected to hold one JSON object per line. Every value is converted
    with ``str()``.

    Args:
        static_data_table_name: Table name; last directory of the path.
        dataset_name: Dataset name; directory directly under ``/tmp``.
        columns: JSON-encoded array of field names to keep, in order.

    Returns:
        A 1-D structured array with one element per non-blank line and one
        unicode field per column, each sized to the longest value in that
        column. A field that a record does not contain is treated like a
        JSON null, which ``str()`` renders as ``"None"``. With no records the
        result has shape ``(0,)`` and width-1 fields.
    """
    columns = json.loads(columns)
    path = "/tmp/{dataset_name}/{static_data_table_name}/data.json".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
    )
    records = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            post = json.loads(line)
            # Rows must be tuples: numpy reads a nested *list* as an extra
            # array dimension and then copies each scalar into every field.
            records.append(tuple(str(post.get(k)) for k in columns))

    # Index by column so there is still one width per column when there
    # are no rows at all (zip(*records) would yield nothing).
    widths = [
        max((len(row[i]) for row in records), default=1)
        for i in range(len(columns))
    ]
    dtype = [(name, "U%d" % width) for name, width in zip(columns, widths)]
    return np.array(records, dtype=dtype)


# ## Downloading data from the Pushshift API
# The data for this asset is served by the Pushshift API, so it is fetched
# into a local directory first; everything below reads from that copy.
download_json = download_subreddit(
    subreddit="vzla", tmp_dir="/tmp/local_subreddits/vzla"
)

# ## Numpy from newline-delimited JSON file
# Load the intermediary newline-delimited JSON file into a Numpy ndarray,
# keeping only the schema columns.
numpy_data = read_json_into_numpy(
    static_data_table_name="vzla",
    dataset_name="local_subreddits",
    columns='["id","author","subreddit","created_utc","title","selftext"]',
)

Schema

Name | Type
id (Key) | StringIdentifier
author | StringIdentifier
subreddit | StringIdentifier
created_utc | POSIXTimestamp
title | FreeText
selftext | FreeText

PushshiftAPILocation Storage

Param | Value
subreddit | vzla