Source code for brainscopypaste.db

"""Database models and related utilities.

This module defines the database structure underlying storage for the analysis.
This consists in models that get turned into PostgreSQL tables by `SQLAlchemy
<http://www.sqlalchemy.org/>`_, along with a few utility classes, functions and
exceptions around them.

:class:`Cluster` and :class:`Quote` represent respectively an individual
cluster or quote from the MemeTracker data set. :class:`Url` represents a quote
occurrence, and those are stored as attributes of :class:`Quote`\ s (as opposed
to in their own table). :class:`Substitution` represents an individual
substitution mined with a given substitution :class:`~.mine.Model`.

Each model (except :class:`Url`, which doesn't have its own table) inherits the
:class:`BaseMixin`, which defines the table name, `id` field, and provides a
common :meth:`~BaseMixin.clone` method.

On top of that, models define a few computed properties (using the
:class:`.utils.cache` decorator) which provide useful information that doesn't
need to be stored directly in the database (storing that in the database would
make first access faster, but introduces more possibilities of inconsistent
data if updates don't align well). :class:`Cluster` and :class:`Substitution`
also inherit functionality from the :mod:`.mine`, :mod:`.filter` and
:mod:`.features` modules, which you can inspect for more details.

Finally, this module defines :func:`save_by_copy`, a useful function to
efficiently import clusters and quotes in bulk into the database.

"""


import re
from io import StringIO
import logging

import click
from sqlalchemy import Column, Integer, String, Boolean, ForeignKey, cast
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.ext.declarative import declarative_base, declared_attr
from sqlalchemy.types import DateTime, Enum, TypeDecorator
from sqlalchemy.dialects.postgresql import ARRAY

from brainscopypaste.utils import cache, session_scope
from brainscopypaste.filter import ClusterFilterMixin
from brainscopypaste.mine import (SubstitutionValidatorMixin,
                                  ClusterMinerMixin, Model, Time, Source,
                                  Past, Durl)
from brainscopypaste.features import SubstitutionFeaturesMixin


logger = logging.getLogger(__name__)
Base = declarative_base()
Session = sessionmaker()


[docs]class BaseMixin: """Common mixin for all models defining a table name, an `id` field, and a `clone()` method.""" @declared_attr def __tablename__(cls): """Compute the table name from the class name.""" return cls.__name__.lower() #: Primary key for the table. id = Column(Integer, primary_key=True)
[docs] def clone(self, **fields): """Clone a model instance, excluding the original `id` and optionally setting some fields to values provided as arguments. Give the fields to override as keyword arguments, their values will be set on the cloned instance. Any field that is not a known table column is ignored. """ columns = self.__mapper__.columns.keys() columns.remove('id') for field in fields.keys(): try: columns.remove(field) except ValueError: pass init = {} for column in columns: init[column] = getattr(self, column) for arg, value in fields.items(): init[arg] = value return self.__class__(**init)
[docs]class Cluster(Base, BaseMixin, ClusterFilterMixin, ClusterMinerMixin): """Represent a MemeTracker cluster of quotes in the database. Attributes below are defined as class attributes or :class:`~.utils.cache`\ d methods, but they appear as instance attributes when you have an actual cluster instance. For instance, if `cluster` is a :class:`Cluster` instance, `cluster.size` will give you that instance's :attr:`size`. See Also -------- .filter.ClusterFilterMixin, .mine.ClusterMinerMixin """ #: Id of the cluster that originated this instance, i.e. the id as it #: appears in the MemeTracker data set. sid = Column(Integer, nullable=False) #: Boolean indicating whether this cluster is part of the filtered (and #: kept) set of clusters or not. filtered = Column(Boolean, default=False, nullable=False) #: Source data set from which this cluster originated. Currently this is #: always `memetracker`. source = Column(String, nullable=False) #: List of :class:`Quote`\ s in this cluster (this is a dynamic #: relationship on which you can run queries). quotes = relationship('Quote', back_populates='cluster', lazy='dynamic', cascade='all, delete-orphan', passive_deletes=True) #: Tuple of column names that are used by :meth:`format_copy`. format_copy_columns = ('id', 'sid', 'filtered', 'source')
[docs] def format_copy(self): """Create a string representing the cluster in a :meth:`cursor.copy_from` or :func:`_copy` call.""" base = ('{cluster.id}\t{cluster.sid}\t{cluster.filtered}\t' '{cluster.source}') return base.format(cluster=self)
@cache def size(self): """Number of quotes in the cluster.""" return self.quotes.count() @cache def size_urls(self): """Number of urls of all the quotes in the cluster (i.e. not counting url frequencies) Look at :attr:`frequency` for a count that takes url frequencies into account. """ return sum(quote.size for quote in self.quotes.all()) @cache def frequency(self): """Complete number of occurrences of all the quotes in the cluster (i.e. counting url frequencies). Look at :attr:`size_urls` for a count that ignores url frequencies. """ return sum(url.frequency for url in self.urls) @cache def urls(self): """Unordered list of :class:`Url`\ s of all the quotes in the cluster.""" urls = [] for quote in self.quotes.all(): urls.extend(quote.urls) return sorted(urls, key=lambda url: url.timestamp) @cache def span(self): """Span of the cluster (as a :class:`~datetime.timedelta`), from first to last occurrence. Raises ------ ValueError If no urls are defined on any quotes of the cluster. """ if self.size_urls == 0: raise ValueError('No urls defined on any quotes of this cluster ' "yet, span doesn't make sense.") timestamps = [] for quote in self.quotes.all(): timestamps.extend(quote.url_timestamps) return abs(max(timestamps) - min(timestamps))
[docs]class ArrayOfEnum(ARRAY): """ARRAY of ENUMs column type, which is not directly supported by DBAPIs. This workaround is provided by `SQLAchemy's documentation <http://docs.sqlalchemy.org/en/rel_1_0/dialects/postgresql.html#using-enum-with-array>`_. """ def bind_expression(self, bindvalue): return cast(bindvalue, self) def result_processor(self, dialect, coltype): super_rp = super(ArrayOfEnum, self).result_processor( dialect, coltype) def handle_raw_string(value): inner = re.match(r"^{(.*)}$", value).group(1) return inner.split(",") if inner else [] def process(value): if value is None: return None return super_rp(handle_raw_string(value)) return process
#: :class:`sqlalchemy.types.Enum` of possible types of :class:`Url`\ s from #: the MemeTracker data set. url_type = Enum('B', 'M', name='url_type', metadata=Base.metadata)
[docs]class SealedException(Exception): """Exception raised when trying to edit a model on which :class:`~.utils.cache`\ d methods have already been accessed."""
[docs]class Quote(Base, BaseMixin): """Represent a MemeTracker quote in the database. Attributes below are defined as class attributes or :class:`~.utils.cache`\ d methods, but they appear as instance attributes when you have an actual quote instance. For instance, if `quote` is a :class:`Quote` instance, `quote.size` will give you that instance's :attr:`size`. Note that children :class:`Url`\ s are stored directly inside this model through lists of url attributes, where a given url is defined by items at the same index in the various lists. This is an internal detail, and you should use the :attr:`urls` attribute to directly get a list of :class:`Url` objects. """ #: Parent cluster id. cluster_id = Column(Integer, ForeignKey('cluster.id', ondelete='CASCADE'), nullable=False) #: Parent :class:`Cluster`. cluster = relationship('Cluster', back_populates='quotes') #: Id of the quote that originated this instance, i.e. the id as it #: appears in the MemeTracker data set. sid = Column(Integer, nullable=False) #: Boolean indicating whether this quote is part of the filtered (and kept) #: set of quotes or not. filtered = Column(Boolean, default=False, nullable=False) #: Text of the quote. string = Column(String, nullable=False) #: List of :class:`~datetime.datetime`\ s representing the times at which #: children urls appear. url_timestamps = Column(ARRAY(DateTime), default=[], nullable=False) #: List of `int`\ s representing the frequencies of children urls (i.e. how #: many times the quote string appears at each url). url_frequencies = Column(ARRAY(Integer), default=[], nullable=False) #: List of :data:`url_type`\ s representing the types of the children urls. url_url_types = Column(ArrayOfEnum(url_type), default=[], nullable=False) #: List of `str`\ s representing the URIs of the children urls. url_urls = Column(ARRAY(String), default=[], nullable=False) #: List of :class:`Substitution`\ s for which this quote is the source #: (this is a dynamic relationship on which you can run queries). substitutions_source = relationship( 'Substitution', back_populates='source', lazy='dynamic', foreign_keys='Substitution.source_id', cascade='all, delete-orphan', passive_deletes=True) #: List of :class:`Substitution`\ s for which this quote is the destination #: (this is a dynamic relationship on which you can run queries). substitutions_destination = relationship( 'Substitution', back_populates='destination', lazy='dynamic', foreign_keys='Substitution.destination_id', cascade='all, delete-orphan', passive_deletes=True) #: Tuple of column names that are used by :meth:`format_copy`. format_copy_columns = ('id', 'cluster_id', 'sid', 'filtered', 'string', 'url_timestamps', 'url_frequencies', 'url_url_types', 'url_urls')
[docs] def format_copy(self): """Create a string representing the quote and all its children urls in a :meth:`cursor.copy_from` or :func:`_copy` call.""" base = '{quote.id}\t{quote.cluster_id}\t{quote.sid}\t{quote.filtered}' parts = [base.format(quote=self)] # Backslashes must be escaped otherwise PostgreSQL interprets them # in its own way (see # http://www.postgresql.org/docs/current/interactive/sql-copy.html). parts.append(self.string.replace('\\', '\\\\')) timestamps = [url.timestamp for url in self.urls] frequencies = [url.frequency for url in self.urls] url_types = [url.url_type for url in self.urls] parts.append('{' + ', '.join(map('{}'.format, timestamps)) + '}') parts.append('{' + ', '.join(map('{}'.format, frequencies)) + '}') parts.append('{' + ', '.join(map('{}'.format, url_types)) + '}') # Two levels of escaping backslashes and double quotes too here. # (See http://www.postgresql.org/docs/9.5/static/arrays.html). urls = [url.url.replace('\\', '\\\\').replace('"', '\\"') for url in self.urls] parts.append( "{" + ', '.join(map('"{}"'.format, urls)).replace('\\', '\\\\') + "}" ) return '\t'.join(parts)
@cache def size(self): """Number of urls in the quote. Look at :attr:`frequency` for a count that takes url frequencies into account. """ if self.url_timestamps is None: return 0 return len(self.url_timestamps) @cache def frequency(self): """Complete number of occurrences of the quote (i.e. counting url frequencies). Look at :attr:`size` for a count that ignores url frequencies. """ if self.size == 0: return 0 return sum(self.url_frequencies) @cache def span(self): """Span of the quote (as a :class:`~datetime.timedelta`), from first to last occurrence. Raises ------ ValueError If no urls are defined on the quote. """ if self.size == 0: raise ValueError('No urls defined on this quote yet, ' "span doesn't make sense.") timestamps = self.url_timestamps return abs(max(timestamps) - min(timestamps)) @cache def tags(self): """List of TreeTagger POS tags of the tokens in the quote's :attr:`string`. Raises ------ ValueError If the quote's :attr:`string` is `None`. """ if self.string is None: raise ValueError('No string defined on this quote yet, ' "tags doesn't make sense.") from brainscopypaste import tagger return tagger.tags(self.string) @cache def tokens(self): """List of the tokens in the quote's :attr:`string`. Raises ------ ValueError If the quote's :attr:`string` is `None`. """ if self.string is None: raise ValueError('No string defined on this quote yet, ' "tokens doesn't make sense.") from brainscopypaste import tagger return tagger.tokens(self.string) @cache def lemmas(self): """List of the lemmas in the quote's :attr:`string`. Raises ------ ValueError If the quote's :attr:`string` is `None`. """ if self.string is None: raise ValueError('No string defined on this quote yet, ' "lemmas doesn't make sense.") from brainscopypaste import tagger return tagger.lemmas(self.string) @cache def urls(self): """Unordered list of :class:`Url`\ s of the quote; use this to access urls of the quote, instead of the `url_*` attributes.""" if self.size == 0: return [] return sorted([Url(timestamp, frequency, url_type, url, quote=self) for (timestamp, frequency, url_type, url) in zip(self.url_timestamps, self.url_frequencies, self.url_url_types, self.url_urls)], key=lambda url: url.timestamp)
[docs] def add_url(self, url): """Add a :class:`Url` to the quote. The change is not automatically saved. If you want to persist this to the database, you should do it inside a session and commit afterwards (e.g. using :func:`~.utils.session_scope`). Parameters ---------- url : :class:`Url` The url to add to the quote. Raises ------ SealedException If the :attr:`urls` attribute has already been accessed; since that attribute is :class:`~.utils.cache`\ d, adding a url to the quote would invalidate the value. """ if 'urls' in self.__dict__: raise SealedException('self.urls has already been accessed, ' 'cannot add more urls') self.url_timestamps = (self.url_timestamps or []) + [url.timestamp] self.url_frequencies = (self.url_frequencies or []) + [url.frequency] self.url_url_types = (self.url_url_types or []) + [url.url_type] self.url_urls = (self.url_urls or []) + [url.url]
[docs] def add_urls(self, urls): """Add a list of :class:`Url`\ s to the quote. As for :meth:`add_url`, the changes are not automatically saved. If you want to persist this to the database, you should do it inside a session and commit afterwards (e.g. using :func:`~.utils.session_scope`). Parameters ---------- urls : list of :class:`Url`\ s The urls to add to the quote. Raises ------ SealedException If the :attr:`urls` attribute has already been accessed; since that attribute is :class:`~.utils.cache`\ d, adding urls to the quote would invalidate the value. """ for url in urls: self.add_url(url)
[docs]class Url: """Represent a MemeTracker url in a :class:`Quote` in the database. The url :attr:`occurrence` is defined below as a :class:`~.utils.cache`\ d method, but it appears as an instance attribute when you have an actual url instance. For instance, if `url` is a :class:`Url` instance, `url.occurrence` will give you that url's :attr:`occurrence`. Note that :class:`Url`\ s are stored directly inside :class:`Quote` instances, and don't have a dedicated database table. Attributes ---------- quote : :class:`Quote` Parent quote. timestamp : :class:`~datetime.datetime` Time at which the url occurred. frequency : int Number of times the quote string appears at this url. url_type : :data:`url_type` Type of this url. url : str URI of this url. """ def __init__(self, timestamp, frequency, url_type, url, quote=None): self.quote = quote self.timestamp = timestamp self.frequency = frequency self.url_type = url_type self.url = url def __key(self): """Unique identifier for this url, used to compute e.g. equality between two :class:`Url` instances.""" return (self.quote, self.timestamp, self.frequency, self.url_type, self.url) def __eq__(self, other): """Determine if two instances represent the same url (underlies e.g. ``url1 == url2``)""" return self.__key() == other.__key() def __hash__(self): """Hash for this url (makes this class hashable, so usable e.g. as dict keys).""" return hash(self.__key()) @cache def occurrence(self): """Index of the url in the list of urls of the parent :class:`Quote`. Raises ------ ValueError If the url's `quote` attribute is `None`. """ if self.quote is None: raise ValueError('No quote defined on this Url') return self.quote.urls.index(self)
[docs]class ModelType(TypeDecorator): """Database type representing a substitution :class:`~.mine.Model`, used in the definition of :class:`Substitution`.""" # Store models as strings in the database. impl = String
[docs] def process_bind_param(self, value, dialect): """Convert a :class:`~.mine.Model` to its database representation.""" return str(value)
[docs] def process_result_value(self, value, dialect): """Create a :class:`~.mine.Model` instance from its database representation.""" return eval(value, globals(), {'Model': Model, 'Source': Source, 'Time': Time, 'Durl': Durl, 'Past': Past})
[docs]class Substitution(Base, BaseMixin, SubstitutionValidatorMixin, SubstitutionFeaturesMixin): """Represent a substitution in the database from one :class:`Quote` to another. A substitution is the replacement of a word from one quote (or a substring of that quote) in another quote. It is defined by a :attr:`source quote <source>`, an :attr:`occurrence` of a :attr:`destination quote <destination>`, the :attr:`position of a substring <start>` in the source quote string, the :attr:`position of the replaced word <position>` in that substring, and the :attr:`substitution model <model>` that detected the substitution in the data set. Attributes below are defined as class attributes or :class:`~.utils.cache`\ d methods, but they appear as instance attributes when you have an actual substitution instance. For instance, if `substitution` is a :class:`Substitution` instance, `substitution.tags` will give you that instance's :attr:`tags`. See Also -------- .mine.SubstitutionValidatorMixin, .features.SubstitutionFeaturesMixin """ #: Id of the source quote for the substitution. source_id = Column(Integer, ForeignKey('quote.id', ondelete='CASCADE'), nullable=False) #: Source :class:`Quote` for the substitution. source = relationship('Quote', back_populates='substitutions_source', foreign_keys='Substitution.source_id') #: Id of the destination quote for the substitution. destination_id = Column(Integer, ForeignKey('quote.id', ondelete='CASCADE'), nullable=False) #: Destination :class:`Quote` for the substitution. destination = relationship('Quote', back_populates='substitutions_destination', foreign_keys='Substitution.destination_id') #: Index of the destination :class:`Url` in the destination quote. occurrence = Column(Integer, nullable=False) #: Index of the beginning of the substring in the source quote. start = Column(Integer, nullable=False) #: Position of the replaced word *in the substring of the source quote* #: (which is also the position in the destination quote). position = Column(Integer, nullable=False) #: Substitution detection :class:`~.mine.Model` that detected this #: substitution. model = Column(ModelType, nullable=False) @cache def tags(self): """Tuple of TreeTagger POS tags of the replaced and replacing words.""" return (self.source.tags[self.start + self.position], self.destination.tags[self.position]) @cache def tokens(self): """Tuple of the replaced and replacing words (the tokens here are the exact replaced and replacing words).""" return (self.source.tokens[self.start + self.position], self.destination.tokens[self.position]) @cache def lemmas(self): """Tuple of lemmas of the replaced and replacing words.""" return (self.source.lemmas[self.start + self.position], self.destination.lemmas[self.position])
[docs]def _copy(string, table, columns): """Execute a PostgreSQL COPY command. COPY is one of the fastest methods to import data in bulk into PostgreSQL. This function executes this operation through the raw psycopg2 :class:`cursor` object. Parameters ---------- string : file-like object Contents of the data to import into the database, formatted for the COPY command (see `PostgreSQL's documentation <https://www.postgresql.org/docs/9.5/static/sql-copy.html>`_ for more details). Can be an :class:`io.StringIO` if you don't want to use a real file in the filesystem. table : str Name of the table into which the data is imported. columns : list of str List of the column names encoded in the `string` parameter. When `string` is produced using :meth:`Quote.format_copy` or :meth:`Cluster.format_copy` you can use the corresponding :attr:`Quote.format_copy_columns` or :attr:`Cluster.format_copy_columns` for this parameter. See Also -------- save_by_copy, Quote.format_copy, Cluster.format_copy """ string.seek(0) with session_scope() as session: cursor = session.connection().connection.cursor() cursor.copy_from(string, table, columns=columns)
[docs]def save_by_copy(clusters, quotes): """Import a list of clusters and a list of quotes into the database. This function uses PostgreSQL's COPY command to bulk import clusters and quotes, and prints its progress to stdout. Parameters ---------- clusters : list of :class:`Cluster`\ s List of clusters to import in the database. quotes : list of :class:`Quote`\ s List of quotes to import in the database. Any clusters they reference should be in the `clusters` parameter. See Also -------- .load.MemeTrackerParser.parse """ # Order the objects inserted so the engine bulks them together. logger.debug("Saving %s clusters with 'copy_from'", len(clusters)) click.echo('Saving clusters... ', nl=False) objects = StringIO() objects.writelines([cluster.format_copy() + '\n' for cluster in clusters]) _copy(objects, Cluster.__tablename__, Cluster.format_copy_columns) objects.close() click.secho('OK', fg='green', bold=True) logger.debug("Saving %s quotes with 'copy_from'", len(quotes)) click.echo('Saving quotes... ', nl=False) objects = StringIO() objects.writelines([quote.format_copy() + '\n' for quote in quotes]) _copy(objects, Quote.__tablename__, Quote.format_copy_columns) objects.close() click.secho('OK', fg='green', bold=True)