Code to download data. Choose a dataset, then a table, and get code to download it as Pandas or Numpy.

hub > snap

Datasets from the Stanford Network Analysis Platform, represented as undirected graphs. Please see additional metadata at SNAP for dataset semantics and citation instructions.

Tables

| Name | Type |
| --- | --- |
| amazon0302 | StaticDataTable |
| amazon0312 | StaticDataTable |
| amazon0505 | StaticDataTable |
| amazon0601 | StaticDataTable |
| ca_astroph | StaticDataTable |
| ca_condmat | StaticDataTable |
| ca_grqc | StaticDataTable |
| ca_hepph | StaticDataTable |
| ca_hepth | StaticDataTable |
| cit_hepph | StaticDataTable |
| cit_hepth | StaticDataTable |
| cit_patents | StaticDataTable |
| p2p_gnutella04 | StaticDataTable |
| p2p_gnutella05 | StaticDataTable |
| p2p_gnutella06 | StaticDataTable |
| p2p_gnutella08 | StaticDataTable |
| p2p_gnutella09 | StaticDataTable |
| p2p_gnutella24 | StaticDataTable |
| p2p_gnutella25 | StaticDataTable |
| p2p_gnutella30 | StaticDataTable |
| p2p_gnutella31 | StaticDataTable |
| roadnet_ca | StaticDataTable |
| roadnet_pa | StaticDataTable |
| roadnet_tx | StaticDataTable |
| soc_epinions1 | StaticDataTable |
| soc_livejournal1 | StaticDataTable |
| soc_slashdot0811 | StaticDataTable |
| soc_slashdot0922 | StaticDataTable |
| web_berkstan | StaticDataTable |
| web_finefoods | StaticDataTable |
| web_google | StaticDataTable |
| web_notredame | StaticDataTable |
| web_stanford | StaticDataTable |
| wiki_elec | StaticDataTable |
| wiki_rfa | StaticDataTable |
| wiki_talk | StaticDataTable |
| wiki_vote | StaticDataTable |
| wiki_hoaxes | StaticDataTable |

Schemas

| Name | Type |
| --- | --- |
| edge | IdentifierTuple |

Source specification


from aorist import (
    MinioLocation,
    WebLocation,
    StaticTabularLayout,
    CSVHeader,
    GzipCompression,
    ORCEncoding,
    TSVEncoding,
    SingleFileLayout,
    RemoteStorage,
    HiveTableStorage,
    RemoteStorageSetup,
    StaticDataTable,
    default_tabular_schema,
    DataSet,
    IdentifierTuple,
    attr_list,
)

# hacky import since submodule imports don't work well
from aorist import attributes as attr

# Datum template named "edge": a tuple of two numeric-identifier
# attributes, `from_id` and `to_id`. It is reused as the schema source for
# every table and registered on the dataset further down in this file.
edge_tuple = IdentifierTuple(
    name="edge",
    attributes=attr_list(
        [attr.NumericIdentifier(column) for column in ("from_id", "to_id")]
    ),
)
# TODO: add the following SNAP datasets:
# soc-Pokec, comm-f2f-Resistance, act-mooc, feather*,
# gemsec*, musae*, soc-sign-bitcoin*
# SNAP dataset names, exactly as they appear in the download URLs.
# The list is assembled category by category; the resulting order is the
# order in which tables are created later in this file.
names = (
    # social networks
    [
        "soc-Epinions1",
        "soc-LiveJournal1",
        "soc-Slashdot0811",
        "soc-Slashdot0922",
    ]
    # citation networks
    + ["cit-HepPh", "cit-HepTh", "cit-Patents"]
    # collaboration networks
    + ["ca-AstroPh", "ca-CondMat", "ca-GrQc", "ca-HepPh", "ca-HepTh"]
    # web networks
    + ["web-BerkStan", "web-Google", "web-NotreDame", "web-Stanford"]
    # co-purchasing
    + ["amazon0302", "amazon0312", "amazon0505", "amazon0601"]
    # internet p2p
    + [
        "p2p-Gnutella04",
        "p2p-Gnutella05",
        "p2p-Gnutella06",
        "p2p-Gnutella08",
        "p2p-Gnutella09",
        "p2p-Gnutella24",
        "p2p-Gnutella25",
        "p2p-Gnutella30",
        "p2p-Gnutella31",
    ]
    # road networks
    + ["roadNet-CA", "roadNet-PA", "roadNet-TX"]
    # Wikipedia
    + ["wiki-Vote", "wiki-Talk", "wiki-Elec", "wiki-RfA", "wiki-hoaxes"]
    # NOTE(review): originally listed under the "Wikipedia" heading even
    # though it carries the `web-` prefix -- confirm the intended category.
    + ["web-FineFoods"]
)
# Build one StaticDataTable per SNAP dataset. The mapping is keyed by the
# original (hyphenated, mixed-case) SNAP name, while each table's own
# `name` and `tag` use the lower-case, underscore-separated form.
tables = {}
for snap_name in names:
    table_id = snap_name.lower().replace("-", "_")

    # Remote source: a single gzip-compressed TSV file on the SNAP site,
    # declared with a 4-line header.
    download_location = WebLocation(
        address="https://snap.stanford.edu/data/%s.txt.gz" % snap_name,
    )
    source = RemoteStorage(
        location=download_location,
        layout=SingleFileLayout(),
        encoding=TSVEncoding(
            compression=GzipCompression(),
            header=CSVHeader(num_lines=4),
        ),
    )

    # NOTE(review): this Hive/ORC storage description is constructed but is
    # not referenced again in the visible file -- confirm whether it should
    # be attached to the table or can be dropped.
    local = HiveTableStorage(
        location=MinioLocation(name=table_id),
        layout=StaticTabularLayout(),
        encoding=ORCEncoding(),
    )

    tables[snap_name] = StaticDataTable(
        name=table_id,
        schema=default_tabular_schema(edge_tuple),
        setup=RemoteStorageSetup(remote=source),
        tag=table_id,
    )

# The dataset object itself: it groups every table in `tables` under the
# name/tag "snap", registers `edge_tuple` as its only datum template, and
# records this file as its source path.
snap_dataset = DataSet(
    name="snap",
    tag="snap",
    sourcePath=__file__,
    datumTemplates=[edge_tuple],
    assets=tables,
    # Markdown description; adjacent literals are joined by the parser.
    description=(
        "Datasets from the [Stanford Network Analysis Platform]"
        "(http://snap.stanford.edu/data/index.html), represented as "
        "undirected graphs. Please see additional metadata at SNAP "
        "for dataset semantics and citation instructions."
    ),
)