Schema Design¶
This document gives a detailed overview of how to read a schema script, including explations of the different components that define a pipeline.
- Goals of a schema
- Front matter
- Imports
- Schema declaration
- Table syntax
- Class inheritance
- Explicit table types
- Definitions
- Methods
- Conceptual table types
Some of this will be redundant with general Python best practices and DataJoint documentation, but it is important be able to read a schema, espically if you plan to write your own.
Later sections will depend on information presented in the article on Table Types.
Goals of a schema¶
- At its core, DataJoint is just a mapping between Python and SQL.
- SQL is a language for managing relational databases.
- DataJoint is opinionated about how to structure the database, and limits SQL's potential options in way that promotes good practices.
- Python stores ...
- A copy of table definitions, that may be out of sync with the database.
- Methods for processing data, that may be out of sync with existing data.
Good data provenance requires good version control and documentation to keep these in sync.
Example schema¶
This is the full example schema referenced in subsections below.
Full Schema
"""Schema example for custom pipelines
Note: `noqa: F401` is a comment that tells linters to ignore the fact that
`Subject` seems unused in the file. If this table is only used in a table
definition string, the linter will not recognize it as being used.
"""
import random # Package import
from typing import Union # Individual class import
from uuid import UUID
import datajoint as dj # Aliased package import
from custom_package.utils import process_df, schema_prefix # custom functions
from spyglass.common import RawPosition, Subject # noqa: F401
from spyglass.utils import SpyglassMixin # Additional Spyglass features
schema = dj.schema(schema_prefix + "_example") # schema name from string
# Model to demonstrate DataJoint syntax
@schema # Decorator to define a table in the schema on the server
class ExampleTable(SpyglassMixin, dj.Manual): # Inherit SpyglassMixin class
"""Table Description""" # Table docstring, one-line if possible
definition = """ # Table comment
primary_key1 : uuid # randomized string
primary_key2 : int # integer
---
secondary_key1 : varchar(32) # string of max length 32
-> Subject # Foreign key reference, inherit primary key of this table
"""
# Model to demonstrate field aliasing with `proj`
@schema
class SubjBlinded(SpyglassMixin, dj.Manual):
"""Blinded subject table.""" # Class docstring for `help()`
definition = """
subject_id: uuid # id
---
-> Subject.proj(actual_id='subject_id')
"""
@property # Static information, Table.property
def pk(self):
"""Return the primary key""" # Function docstring for `help()`
return self.heading.primary_key
@staticmethod # Basic func with no reference to self instance
def _subj_dict(subj_uuid: UUID): # Type hint for argument
"""Return the subject dict"""
return {"subject_id": subj_uuid}
@classmethod # Class, not instance. Table.func(), not Table().func()
def hash(cls, argument: Union[str, dict] = None): # Default value
"""Example class method"""
return dj.hash.key_hash(argument)
def blind_subjects(self, restriction: Union[str, dict]): # Union is "or"
"""Import all subjects selected by the restriction"""
insert_keys = [
{
**self._subj_dict(self.hash(key)),
"actual_id": key["subject_id"],
}
for key in (Subject & restriction).fetch("KEY")
]
self.insert(insert_keys, skip_duplicates=True)
def return_subj(self, key: str):
"""Return the entry in subject table"""
if isinstance(key, dict): # get rid of extra values
key = key["subject_id"]
key = self._subj_dict(key)
actual_ids = (self & key).fetch("actual_id")
ret = [{"subject_id": actual_id} for actual_id in actual_ids]
return ret[0] if len(ret) == 1 else ret
@schema
class MyParams(SpyglassMixin, dj.Lookup): # Lookup allows for default values
"""Parameter table."""
definition = """
param_name: varchar(32)
---
params: blob
"""
contents = [ # Default values as list of tuples
["example1", {"A": 1, "B": 2}],
["example2", {"A": 3, "B": 4}],
]
@classmethod
def insert_default(cls): # Not req for dj.Lookup, but Spyglass convention
"""Insert default values.""" # skip_duplicates prevents errors
cls().insert(rows=cls.contents, skip_duplicates=True)
@schema
class MyAnalysisSelection(SpyglassMixin, dj.Manual):
"""Selection table.""" # Pair subjects and params for computation
definition = """
-> SubjBlinded
-> MyParams
"""
def insert_all(self, param_name="example1"): # Optional helper function
"""Insert all subjects with given param name"""
self.insert(
[
{**subj_key, "param_name": param_name}
for subj_key in SubjBlinded.fetch("KEY")
],
skip_duplicates=True,
)
@schema
class MyAnalysis(SpyglassMixin, dj.Computed):
"""Analysis table."""
# One or more foreign keys, no manual input
definition = """
-> MyAnalysisSelection
"""
class MyPart(SpyglassMixin, dj.Part):
"""Part table."""
definition = """
-> MyAnalysis
---
result: int
"""
def make(self, key):
# Prepare for computation
this_subj = SubjBlinded().return_subj(key["subject_id"])
param_key = {"param_name": key["param_name"]}
these_param = (MyParams & param_key).fetch1("params")
# Perform computation.
# Ideally, all data is linked with foreign keys, but not enforced
for pos_obj in RawPosition.PosObject * (Subject & this_subj):
dataframe = (RawPosition.PosObject & pos_obj).fetch1_dataframe()
result = process_df(dataframe, **these_param)
part_inserts = [] # Prepare inserts, to minimize insert calls
for _ in range(10):
result += random.randint(0, 100)
part_inserts.append(dict(key, result=result))
self.insert1(key) # Insert into 'master' first, then all parts
self.MyPart().insert(rows=part_inserts, skip_duplicates=True)
Front matter¶
At the beginning of the schema file, you'll find ...
- Script docstring
- Imports
- Aliased imports
- Package imports
- Individual imports
- Relative imports
- Schema declaration
"""Schema example for custom pipelines
Note: `noqa: F401` is a comment that tells linters to ignore the fact that
`Subject` seems unused in the file. If this table is only used in a table
definition string, the linter will not recognize it as being used.
"""
import random # Package import
from typing import Union # Individual class import
from uuid import UUID
import datajoint as dj # Aliased package import
from custom_package.utils import process_df, schema_prefix # custom functions
from spyglass.common import RawPosition, Subject # noqa: F401
from spyglass.utils import SpyglassMixin # Additional Spyglass features
schema = dj.schema(schema_prefix + "_example") # schema name from string
- The
schema
variable determines the name of the schema in the database. - Existing schema prefixes (e.g.,
common
) should not be added to without discussion with the Spyglass team. - Database admins may be interested in limiting privileges on a per-prefix basis. For example, Frank Lab members use ...
- Their respective usernames for solo work
- Project-specific prefixes for shared work.
Table syntax¶
Each table is defined as a Python class, with a definition
attribute that
contains the SQL-like table definition.
Class inheritance¶
The parentheses in the class definition indicate that the class inherits from.
This table is ...
- A
SpyglassMixin
class, which provides a number of useful methods specific to Spyglass as discussed in the mixin article. - A DataJoint
Manual
table, which is a table that is manually populated.
@schema # Decorator to define a table in the schema on the server
class ExampleTable(SpyglassMixin, dj.Manual): # Inherit SpyglassMixin class
pass
Table types¶
- DataJoint types:
Manual
tables are manually populated.Lookup
tables can be populated on declaration, and rarely change.Computed
tables are populated by a method runs computations on upstream entries.Imported
tables are populated by a method that imports data from another source.Part
tables are used to store data that is conceptually part of another table.
- Spyglass conceptual types:
- Optional upstream Data tables from a previous pipeline.
- Parameter tables (often
dj.Lookup
) store parameters for analysis. - Selection tables store pairings of parameters and data to be analyzed.
- Compute tables (often
dj.Computed
) store the results of analysis. - Merge tables combine data from multiple pipeline versions.
Definitions¶
Each table can have a docstring that describes the table, and must have a
definition
attribute that contains the SQL-like table definition.
-
#
comments are used to describe the table and its columns. -
---
separates the primary key columns from the data columns. -
field : datatype
defines a column using a SQL datatype -
->
indicates a foreign key reference to another table.
@schema # Decorator to define a table in the schema on the server
class ExampleTable(SpyglassMixin, dj.Manual): # Inherit SpyglassMixin class
"""Table Description""" # Table docstring, one-line if possible
definition = """ # Table comment
primary_key1 : uuid # randomized string
primary_key2 : int # integer
---
secondary_key1 : varchar(32) # string of max length 32
-> Subject # Foreign key reference, inherit primary key of this table
"""
Methods¶
Many Spyglss tables have methods that provide functionality for the pipeline.
Check out our API documentation for a full list of available methods.
This example models subject blinding to demonstrate ...
- An aliased foreign key in the definition, using
proj
to rename the field. - A static property that returns the primary key.
- A static method that returns a dictionary of subject information.
- A class method that hashes an argument.
- An instance method that self-inserts subjects based on a restriction.
- An instance method that returns the unblinded subject information.
# Model to demonstrate field aliasing with `proj`
@schema
class SubjBlinded(SpyglassMixin, dj.Manual):
"""Blinded subject table.""" # Class docstring for `help()`
definition = """
subject_id: uuid # id
---
-> Subject.proj(actual_id='subject_id')
"""
@property # Static information, Table.property
def pk(self):
"""Return the primary key""" # Function docstring for `help()`
return self.heading.primary_key
@staticmethod # Basic func with no reference to self instance
def _subj_dict(subj_uuid: UUID): # Type hint for argument
"""Return the subject dict"""
return {"subject_id": subj_uuid}
@classmethod # Class, not instance. Table.func(), not Table().func()
def hash(cls, argument: Union[str, dict] = None): # Default value
"""Example class method"""
return dj.hash.key_hash(argument)
def blind_subjects(self, restriction: Union[str, dict]): # Union is "or"
"""Import all subjects selected by the restriction"""
insert_keys = [
{
**self._subj_dict(self.hash(key)),
"actual_id": key["subject_id"],
}
for key in (Subject & restriction).fetch("KEY")
]
self.insert(insert_keys, skip_duplicates=True)
def return_subj(self, key: str):
"""Return the entry in subject table"""
if isinstance(key, dict): # get rid of extra values
key = key["subject_id"]
key = self._subj_dict(key)
actual_ids = (self & key).fetch("actual_id")
ret = [{"subject_id": actual_id} for actual_id in actual_ids]
return ret[0] if len(ret) == 1 else ret
Example Table Types¶
Params Table¶
This stores the set of values that may be used in an analysis. For analyses that
are unlikely to change, consider specifying all parameters in the table's
secondary keys. For analyses that may have different parameters, of when
depending on outside packages, consider a blob
datatype that can store a
python dictionary.
@schema
class MyParams(SpyglassMixin, dj.Lookup): # Lookup allows for default values
"""Parameter table."""
definition = """
param_name: varchar(32)
---
params: blob
"""
contents = [ # Default values as list of tuples
["example1", {"A": 1, "B": 2}],
["example2", {"A": 3, "B": 4}],
]
@classmethod
def insert_default(cls): # Not req for dj.Lookup, but Spyglass convention
"""Insert default values.""" # skip_duplicates prevents errors
cls().insert(rows=cls.contents, skip_duplicates=True)
Selection Table¶
This is the staging area to pair sessions with parameter sets. Depending on what is inserted, you might pair the same subject with different parameter sets, or different subjects with the same parameter set.
@schema
class MyAnalysisSelection(SpyglassMixin, dj.Manual):
"""Selection table.""" # Pair subjects and params for computation
definition = """
-> SubjBlinded
-> MyParams
"""
def insert_all(self, param_name="example1"): # Optional helper function
"""Insert all subjects with given param name"""
self.insert(
[
{**subj_key, "param_name": param_name}
for subj_key in SubjBlinded.fetch("KEY")
],
skip_duplicates=True,
)
Compute Table¶
This is how processing steps are paired with data entry. By running
MyAnalysis().populate()
, the make
method is called for each foreign key
pairing in the selection table. The make
method should end in one or one
inserts into the compute table.
@schema
class MyAnalysis(SpyglassMixin, dj.Computed):
"""Analysis table."""
# One or more foreign keys, no manual input
definition = """
-> MyAnalysisSelection
"""
class MyPart(SpyglassMixin, dj.Part):
"""Part table."""
definition = """
-> MyAnalysis
---
result: int
"""
def make(self, key):
# Prepare for computation
this_subj = SubjBlinded().return_subj(key["subject_id"])
param_key = {"param_name": key["param_name"]}
these_param = (MyParams & param_key).fetch1("params")
# Perform computation.
# Ideally, all data is linked with foreign keys, but not enforced
for pos_obj in RawPosition.PosObject * (Subject & this_subj):
dataframe = (RawPosition.PosObject & pos_obj).fetch1_dataframe()
result = process_df(dataframe, **these_param)
part_inserts = [] # Prepare inserts, to minimize insert calls
for _ in range(10):
result += random.randint(0, 100)
part_inserts.append(dict(key, result=result))
self.insert1(key) # Insert into 'master' first, then all parts
self.MyPart().insert(rows=part_inserts, skip_duplicates=True)
To see how tables of a given schema relate to one another, use a schema diagram