Code to download data. Choose a dataset, then a table, and get code to download it as a Pandas DataFrame or a NumPy array.

hub > snap > ca_hepph

R

R code Copy

# RImports
library("glue")
library("reticulate")
library("rjson")

# Download a remote file into <tmp_dir>/<dataset_name>/<table_name>/<dest_file_name>,
# creating the directory tree first if it does not exist yet.
#
# Args:
#   src_url:        URL of the remote resource to fetch.
#   tmp_dir:        local scratch directory root.
#   dest_file_name: file name to store the download under.
#   table_name:     table identifier, used as the innermost directory.
#   dataset_name:   dataset identifier, used as the middle directory.
#
# Returns: the integer status code from download.file() (0 on success).
download.data.from.remote.web.location <- function(src_url, tmp_dir,
    dest_file_name, table_name, dataset_name) {
    # file.path() is the idiomatic, platform-aware way to build paths
    # (replaces the manual paste(sep = "/") concatenation).
    path <- file.path(tmp_dir, dataset_name, table_name)
    if (!dir.exists(path)) {
        dir.create(path, recursive = TRUE)
    }
    download.file(src_url, file.path(path, dest_file_name))
}
# Python source, executed later via reticulate::py_run_string(), that streams
# a TSV file and rewrites it as CSV by replacing tab characters with commas
# in fixed 10 KiB chunks, so arbitrarily large files never have to fit in
# memory (a tab is a single character, so it cannot straddle a chunk
# boundary).
# NOTE(review): kept as a verbatim string -- do not reformat; the escaped
# quotes and the indentation are significant Python syntax.
tsv_to_csv <- '
import os.path
def tsv_to_csv(tmp_dir, src_file_name, dest_file_name, dataset_name, table_name
    ):
    source = os.path.join(tmp_dir, dataset_name, table_name, src_file_name)
    dest = os.path.join(tmp_dir, dataset_name, table_name, dest_file_name)
    with open(source) as sourcef, open(dest, \'w\') as destf:
        while True:
            chunk = sourcef.read(10240)
            if not chunk:
                destf.flush()
                destf.close()
                break
            destf.write(chunk.replace(\'\t\', \',\'))
'
# Load the intermediary CSV for a table into an R data frame and apply the
# supplied column names.
#
# Args:
#   static_data_table_name: table identifier (directory and file stem).
#   dataset_name:           dataset identifier (directory component).
#   columns:                JSON-encoded array of column names, e.g.
#                           '["from_id","to_id"]'.
#   tmp_dir:                scratch directory root used by the download step.
#
# Returns: a data.frame with one column per entry in `columns`.
read.csv.for.table <- function(static_data_table_name, dataset_name,
    columns, tmp_dir) {
    columns <- fromJSON(columns)
    # The header row was stripped earlier in the pipeline, so the file is
    # headerless. header = FALSE stops read.csv() from consuming the first
    # data row as column names (mirrors header=None in the Pandas variant);
    # without it the first edge of the graph was silently lost.
    data <- read.csv(glue("{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv"),
        header = FALSE)
    colnames(data) <- columns
    # message() actually emits the status line; the original sprintf()
    # result was computed and then silently discarded.
    message(sprintf("Downloaded dataframe with %d rows and %d columns.",
        nrow(data), ncol(data)))
    return(data)
}
# ## Downloading data from remote web location
# The raw asset lives on a web server, so the first pipeline step is to
# fetch it into the local scratch directory.
download_location <- download.data.from.remote.web.location(
    src_url = "https://snap.stanford.edu/data/ca-HepPh.txt.gz",
    tmp_dir = "/tmp/snap/ca_hepph",
    dataset_name = "snap",
    table_name = "ca_hepph",
    dest_file_name = "ca_hepph.downloaded")

# ## Decompressing Gzip file
# The downloaded asset is compressed with the GZIP algorithm; decompress it
# in place (via a temporary file) before any further processing.
# The original glue() call added nothing (the template was literally
# "gunzip {command}"), so the command is assembled with plain paste().
decompress_gzip <- system(paste("gunzip",
    "--suffix=downloaded -c /tmp/snap/ca_hepph/ca_hepph.downloaded > /tmp/snap/ca_hepph/ca_hepph.tmp && mv /tmp/snap/ca_hepph/ca_hepph.tmp /tmp/snap/ca_hepph/ca_hepph.downloaded"))

# ## Removing file header
# The tabular file begins with a header block; strip the first lines with
# tail before the data rows are parsed.
remove_header <- system(glue(
    "tail -n +{n} {tmp_dir}/{table_name}.downloaded > {tmp_dir}/{file_name}.tmp && mv {tmp_dir}/{file_name}.tmp {tmp_dir}/{file_name}",
    tmp_dir = "/tmp/snap/ca_hepph",
    table_name = "ca_hepph",
    file_name = "ca_hepph.downloaded",
    n = "5"))

# ## Convert TSV data to CSV
# Run the embedded Python converter (the `tsv_to_csv` source string defined
# earlier) through reticulate: the function definition and its invocation
# are concatenated with a newline and executed as a single Python script.
# NOTE(review): the second paste() argument is Python source -- the embedded
# \n and quoting are significant; do not reformat.
convert_csv <- py_run_string(paste(tsv_to_csv, "tsv_to_csv(table_name='ca_hepph', src_file_name='ca_hepph.downloaded',\n    tmp_dir='/tmp/snap/ca_hepph', dataset_name='snap', dest_file_name=\n    'ca_hepph.csv')\n", 
    sep = "\n"))

# ## Create an R data frame from CSV data
# Load the intermediary CSV file into an R data frame with named columns.
r_data <- read.csv.for.table(
    static_data_table_name = "ca_hepph",
    dataset_name = "snap",
    columns = "[\"from_id\",\"to_id\"]",
    tmp_dir = "/tmp/snap/ca_hepph")

Pandas

requirements.txt Copy

pandas==1.2.3
pip command Copy

pip3 install pandas==1.2.3
Pandas code Copy

# PythonImports
import json
import os
import os.path
import pandas as pd
import subprocess
import urllib.request


def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download a remote file into <tmp_dir>/<dataset_name>/<table_name>/.

    Parameters
    ----------
    src_url : str
        URL of the remote resource to fetch.
    tmp_dir : str
        Local scratch directory root.
    dest_file_name : str
        File name to store the download under.
    table_name : str
        Table identifier, used as the innermost directory component.
    dataset_name : str
        Dataset identifier, used as the middle directory component.
    """
    # Build the target directory with os.path.join instead of manual "/"
    # string concatenation, consistent with the retrieval path below.
    target_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(target_dir, exist_ok=True)
    urllib.request.urlretrieve(
        src_url, os.path.join(target_dir, dest_file_name)
    )


def tsv_to_csv(
    tmp_dir, src_file_name, dest_file_name, dataset_name, table_name
):
    """Convert a TSV file to CSV by replacing tabs with commas.

    Streams the input in fixed 10 KiB chunks so arbitrarily large files
    never have to fit in memory; a tab is a single character, so it can
    never be split across a chunk boundary.

    Parameters
    ----------
    tmp_dir, dataset_name, table_name : str
        Components of the working directory <tmp_dir>/<dataset>/<table>/.
    src_file_name : str
        Name of the TSV input inside the working directory.
    dest_file_name : str
        Name of the CSV output inside the working directory.
    """
    source = os.path.join(tmp_dir, dataset_name, table_name, src_file_name)
    dest = os.path.join(tmp_dir, dataset_name, table_name, dest_file_name)
    with open(source) as sourcef, open(dest, "w") as destf:
        # iter(callable, sentinel) loops until read() returns "".  The
        # original's explicit flush()/close() inside the `with` block were
        # redundant -- the context manager already does both on exit.
        for chunk in iter(lambda: sourcef.read(10240), ""):
            destf.write(chunk.replace("\t", ","))


def read_csv_into_pandas(
    static_data_table_name, dataset_name, columns, tmp_dir
):
    """Load the intermediary CSV for a table into a Pandas DataFrame.

    ``columns`` is a JSON-encoded list of column names; the CSV file itself
    is headerless (its header was stripped earlier in the pipeline), so the
    names are applied after reading.  Returns the named DataFrame.
    """
    column_names = json.loads(columns)
    # The file lives at <tmp_dir>/<dataset>/<table>/<table>.csv.
    csv_path = "{0}/{1}/{2}/{2}.csv".format(
        tmp_dir, dataset_name, static_data_table_name
    )
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = column_names
    print("Downloaded dataframe with %d rows and %d columns." % frame.shape)
    return frame


# ## Downloading data from remote web location
# The raw asset lives on a web server; fetch it into the local scratch
# directory before doing anything else with it.
download_location = download_data_from_remote_web_location(
    src_url="https://snap.stanford.edu/data/ca-HepPh.txt.gz",
    tmp_dir="/tmp/snap/ca_hepph",
    dataset_name="snap",
    table_name="ca_hepph",
    dest_file_name="ca_hepph.downloaded",
)

# ## Decompressing Gzip file
# The asset is compressed with the GZIP algorithm; decompress it in place
# (via a temporary file) before any further processing.
process = subprocess.Popen(
    "gunzip {0}".format(
        "--suffix=downloaded -c /tmp/snap/ca_hepph/ca_hepph.downloaded > /tmp/snap/ca_hepph/ca_hepph.tmp && mv /tmp/snap/ca_hepph/ca_hepph.tmp /tmp/snap/ca_hepph/ca_hepph.downloaded"
    ),
    shell=True,
    stdout=subprocess.PIPE,
)
decompress_gzip, error = process.communicate()

# ## Removing file header
# The tabular file begins with a header block; strip the first lines with
# tail before the data rows are parsed.
process = subprocess.Popen(
    "tail -n +{0} {1}/{2}.downloaded > {1}/{3}.tmp && mv {1}/{3}.tmp {1}/{3}".format(
        "5", "/tmp/snap/ca_hepph", "ca_hepph", "ca_hepph.downloaded"
    ),
    shell=True,
    stdout=subprocess.PIPE,
)
remove_header, error = process.communicate()

# ## Convert TSV data to CSV
# Rewrite the tab-separated file as comma-separated so it can be parsed
# as CSV downstream.
convert_csv = tsv_to_csv(
    tmp_dir="/tmp/snap/ca_hepph",
    dataset_name="snap",
    table_name="ca_hepph",
    src_file_name="ca_hepph.downloaded",
    dest_file_name="ca_hepph.csv",
)

# ## Create Pandas Dataframe from CSV data
# Load the intermediary CSV file into a Pandas dataframe with named columns.
pandas_data = read_csv_into_pandas(
    static_data_table_name="ca_hepph",
    dataset_name="snap",
    columns='["from_id","to_id"]',
    tmp_dir="/tmp/snap/ca_hepph",
)

Numpy

requirements.txt Copy

numpy==1.20.1
pip command Copy

pip3 install numpy==1.20.1
Numpy code Copy

# PythonImports
import csv
import json
import numpy as np
import os
import os.path
import subprocess
import urllib.request


def download_data_from_remote_web_location(
    src_url, tmp_dir, dest_file_name, table_name, dataset_name
):
    """Download a remote file into <tmp_dir>/<dataset_name>/<table_name>/.

    Parameters
    ----------
    src_url : str
        URL of the remote resource to fetch.
    tmp_dir : str
        Local scratch directory root.
    dest_file_name : str
        File name to store the download under.
    table_name : str
        Table identifier, used as the innermost directory component.
    dataset_name : str
        Dataset identifier, used as the middle directory component.
    """
    # Build the target directory with os.path.join instead of manual "/"
    # string concatenation, consistent with the retrieval path below.
    target_dir = os.path.join(tmp_dir, dataset_name, table_name)
    os.makedirs(target_dir, exist_ok=True)
    urllib.request.urlretrieve(
        src_url, os.path.join(target_dir, dest_file_name)
    )


def tsv_to_csv(
    tmp_dir, src_file_name, dest_file_name, dataset_name, table_name
):
    """Convert a TSV file to CSV by replacing tabs with commas.

    Streams the input in fixed 10 KiB chunks so arbitrarily large files
    never have to fit in memory; a tab is a single character, so it can
    never be split across a chunk boundary.

    Parameters
    ----------
    tmp_dir, dataset_name, table_name : str
        Components of the working directory <tmp_dir>/<dataset>/<table>/.
    src_file_name : str
        Name of the TSV input inside the working directory.
    dest_file_name : str
        Name of the CSV output inside the working directory.
    """
    source = os.path.join(tmp_dir, dataset_name, table_name, src_file_name)
    dest = os.path.join(tmp_dir, dataset_name, table_name, dest_file_name)
    with open(source) as sourcef, open(dest, "w") as destf:
        # iter(callable, sentinel) loops until read() returns "".  The
        # original's explicit flush()/close() inside the `with` block were
        # redundant -- the context manager already does both on exit.
        for chunk in iter(lambda: sourcef.read(10240), ""):
            destf.write(chunk.replace("\t", ","))


def read_csv_into_numpy(static_data_table_name, dataset_name, columns, tmp_dir):
    """Load the intermediary CSV for a table into a structured NumPy array.

    ``columns`` is a JSON-encoded list of field names; the CSV file itself
    is headerless (its header was stripped earlier in the pipeline).
    Returns a structured ndarray with one field per name.
    """
    column_names = json.loads(columns)
    file_name = "{tmp_dir}/{dataset_name}/{static_data_table_name}/{static_data_table_name}.csv".format(
        dataset_name=dataset_name,
        static_data_table_name=static_data_table_name,
        tmp_dir=tmp_dir,
    )
    # Open the file in a `with` block so the handle is always closed -- the
    # original passed a bare open() into the generator and leaked it.
    with open(file_name) as csv_file:
        # Re-escape each parsed CSV row as a tab-separated line: genfromtxt
        # splits on a single-character delimiter, so any embedded tabs are
        # squashed to spaces first.
        escaped = (
            "\t".join([i.replace("\t", " ") for i in x])
            for x in csv.reader(csv_file)
        )
        # genfromtxt consumes the generator lazily, so it must run while
        # the file is still open.
        data = np.genfromtxt(
            escaped, delimiter="\t", names=column_names, dtype=None
        )
    assert data is not None
    print("Downloaded ndarray with %d rows." % data.shape[0])
    return data


# ## Downloading data from remote web location
# The raw asset lives on a web server; fetch it into the local scratch
# directory before doing anything else with it.
download_location = download_data_from_remote_web_location(
    src_url="https://snap.stanford.edu/data/ca-HepPh.txt.gz",
    tmp_dir="/tmp/snap/ca_hepph",
    dataset_name="snap",
    table_name="ca_hepph",
    dest_file_name="ca_hepph.downloaded",
)

# ## Decompressing Gzip file
# The asset is compressed with the GZIP algorithm; decompress it in place
# (via a temporary file) before any further processing.
process = subprocess.Popen(
    "gunzip {0}".format(
        "--suffix=downloaded -c /tmp/snap/ca_hepph/ca_hepph.downloaded > /tmp/snap/ca_hepph/ca_hepph.tmp && mv /tmp/snap/ca_hepph/ca_hepph.tmp /tmp/snap/ca_hepph/ca_hepph.downloaded"
    ),
    shell=True,
    stdout=subprocess.PIPE,
)
decompress_gzip, error = process.communicate()

# ## Removing file header
# The tabular file begins with a header block; strip the first lines with
# tail before the data rows are parsed.
process = subprocess.Popen(
    "tail -n +{0} {1}/{2}.downloaded > {1}/{3}.tmp && mv {1}/{3}.tmp {1}/{3}".format(
        "5", "/tmp/snap/ca_hepph", "ca_hepph", "ca_hepph.downloaded"
    ),
    shell=True,
    stdout=subprocess.PIPE,
)
remove_header, error = process.communicate()

# ## Convert TSV data to CSV
# Rewrite the tab-separated file as comma-separated so it can be parsed
# as CSV downstream.
convert_csv = tsv_to_csv(
    tmp_dir="/tmp/snap/ca_hepph",
    dataset_name="snap",
    table_name="ca_hepph",
    src_file_name="ca_hepph.downloaded",
    dest_file_name="ca_hepph.csv",
)

# ## Create Numpy ndarray from CSV data
# Load the intermediary CSV file into a structured NumPy ndarray with
# named fields.
numpy_data = read_csv_into_numpy(
    static_data_table_name="ca_hepph",
    dataset_name="snap",
    columns='["from_id","to_id"]',
    tmp_dir="/tmp/snap/ca_hepph",
)

Schema

Name | Type
from_id | NumericIdentifier
to_id | NumericIdentifier

WebLocation Storage

Param | Value
address | https://snap.stanford.edu/data/ca-HepPh.txt.gz