From af4b6ceeb371e347cf8255fe041a07eddf4efcaa Mon Sep 17 00:00:00 2001 From: Rich Wareham <rjw57@cam.ac.uk> Date: Mon, 21 Jan 2019 13:45:31 +0000 Subject: [PATCH 1/4] initial implementation Add an initial implementation of the scheduler application. The initial implementation contains: * Documentation (README, an API reference and getting started guide) * Test harness using tox * docker-compose based development environment * A Dockerfile * An ingest loop for Google Sheets * A stub implementation of the Opencast scheduler which prints what events should be scheduled. --- .coveragerc | 7 + .dockerignore | 20 ++ .editorconfig | 13 + .flake8 | 3 + .gitignore | 109 ++++++++ .gitlab-ci.yml | 57 ++++ Dockerfile | 14 + LICENSE | 21 ++ README.md | 57 ++++ compose.sh | 31 +++ compose/base.env | 2 + compose/base.yml | 2 + compose/development.env | 2 + compose/development.yml | 21 ++ compose/tox.env | 5 + compose/tox.yml | 18 ++ configuration-template.yaml | 16 ++ doc/conf.py | 170 ++++++++++++ doc/gettingstarted.rst | 53 ++++ doc/hacking.rst | 34 +++ doc/index.rst | 13 + doc/reference.rst | 6 + doc/requirements.txt | 3 + doc/sheet-example.csv | 9 + requirements.txt | 16 ++ scheduler/__init__.py | 43 +++ scheduler/__main__.py | 4 + scheduler/config.py | 53 ++++ scheduler/events.py | 36 +++ scheduler/googlesheets.py | 395 +++++++++++++++++++++++++++ scheduler/loop.py | 152 +++++++++++ scheduler/opencast.py | 24 ++ scheduler/tests/__init__.py | 0 scheduler/tests/test_googlesheets.py | 219 +++++++++++++++ scheduler/tests/test_loop.py | 140 ++++++++++ scheduler/tool.py | 53 ++++ scheduler_development.sh | 14 + setup.py | 27 ++ tox.ini | 70 +++++ tox.sh | 26 ++ 40 files changed, 1958 insertions(+) create mode 100644 .coveragerc create mode 100644 .dockerignore create mode 100644 .editorconfig create mode 100644 .flake8 create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100755 compose.sh create mode 100644 compose/base.env create mode 100644 compose/base.yml create mode 100644 compose/development.env create mode 100644 compose/development.yml create mode 100644 compose/tox.env create mode 100644 compose/tox.yml create mode 100644 configuration-template.yaml create mode 100644 doc/conf.py create mode 100644 doc/gettingstarted.rst create mode 100644 doc/hacking.rst create mode 100644 doc/index.rst create mode 100644 doc/reference.rst create mode 100644 doc/requirements.txt create mode 100644 doc/sheet-example.csv create mode 100644 requirements.txt create mode 100644 scheduler/__init__.py create mode 100644 scheduler/__main__.py create mode 100644 scheduler/config.py create mode 100644 scheduler/events.py create mode 100644 scheduler/googlesheets.py create mode 100644 scheduler/loop.py create mode 100644 scheduler/opencast.py create mode 100644 scheduler/tests/__init__.py create mode 100644 scheduler/tests/test_googlesheets.py create mode 100644 scheduler/tests/test_loop.py create mode 100644 scheduler/tool.py create mode 100755 scheduler_development.sh create mode 100644 setup.py create mode 100644 tox.ini create mode 100755 tox.sh diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..f15a473 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,7 @@ +[run] +omit= + doc/conf.py + setup.py + */test/* + */tests/* + .tox/* diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6557eb4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,20 @@ +.dockerignore +Dockerfile +*.sqlite3 +__pycache__ +*.pyc 
+*.pyo +*.pyd +.Python +env +pip-log.txt +pip-delete-this-directory.txt +.tox +.coverage +.coverage.* +.cache +coverage.xml +*,cover +*.log +.git +build diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..ff5d1b3 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,13 @@ +# Editorconfig file for cross-platform configuration of editors. +root=true + +[*.py] +max_line_length=99 + +[*.{yml,yaml}] +indent_style=space +indent_size=2 + +[*.md] +indent_style=space +indent_size=2 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..353401b --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length=99 +exclude = venv,.tox,*/migrations/*,ui/frontend/*,build/* diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ebeff24 --- /dev/null +++ b/.gitignore @@ -0,0 +1,109 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +*.sqlite3 + +# PyCharm +.idea +.scheduler diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..4cba28e --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,57 @@ +# This file pulls in the GitLab AutoDevOps configuration[1] via an include +# directive and then overrides bits. The rationale for this is we'd like this +# file to eventually have zero local overrides so that we can use the AutoDevOps +# pipeline as-is. + +include: + # Bring in the AutoDevOps template from GitLab. + # + # TODO: when we ship GitLab 11.7, replace this with include:template: + - 'https://gitlab.com/gitlab-org/gitlab-ee/raw/master/lib/gitlab/ci/templates/Auto-DevOps.gitlab-ci.yml' + +variables: + # Disable bits of the AutoDevOps pipeline which we don't use. + CODE_QUALITY_DISABLED: "true" + LICENSE_MANAGEMENT_DISABLED: "true" + PERFORMANCE_DISABLED: "true" + SAST_DISABLED: "true" + DEPENDENCY_SCANNING_DISABLED: "true" + CONTAINER_SCANNING_DISABLED: "true" + DAST_DISABLED: "true" + REVIEW_DISABLED: "true" + +# Note: this environment contains only the configuration which differs from the +# AutoDevOps "test" environment. +test: + image: docker:stable-git + + services: + - docker:stable-dind + + before_script: + # Show some information on the docker install + - docker info + + script: + # Configure the test environment to be able to run docker-compose. 
+    - setup_docker
+    - apk add bash py-pip
+    - pip install docker-compose
+
+    # The ./tox.sh script will build the image implicitly if it thinks it is out
+    # of date but we do it explicitly here to fail early if there is an issue
+    # with the image.
+    - ./compose.sh tox build
+
+    # Run the tests.
+    - ./tox.sh
+
+  # Look for the summary line output from coverage's text report. The
+  # parentheses are used to indicate which portion of the report contains the
+  # coverage percentage.
+  coverage: '/^TOTAL\s+\d+\s+\d+\s+(\d+)%$/'
+
+  variables:
+    # Disable bind mounting of the repository inside the tox container. This
+    # allows us to test the code which ended up in the production image.
+    DISABLE_BIND_MOUNT: "true"
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..880d8a1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM uisautomation/python:3.7-alpine
+
+WORKDIR /usr/src/app
+
+# Install specific requirements for the package.
+ADD requirements.txt ./
+RUN pip install -r requirements.txt
+
+# Copy application source and install it. Use "-e" to avoid needlessly copying
+# files into the site-packages directory.
+ADD ./ ./
+RUN pip install -e .
+
+ENTRYPOINT ["scheduler"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..2b96072
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 University of Cambridge Information Services
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index e69de29..38e927c 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,57 @@
+# Lecture capture scheduling engine
+
+This repository contains a tool to schedule lecture capture recordings. It has
+the following features:
+
+* Ingests events to be scheduled from Google Sheets.
+* Respects lecturer opt-in/out preferences. (TODO)
+* Reconciles Opencast events with the schedule. (TODO)
+* Sets appropriate role-based access policies on Opencast events. (TODO)
+
+## Further documentation
+
+Usage and API documentation may be built using tox:
+
+```bash
+$ COMPOSE_ARGS="-v $PWD/build/:/tmp/tox-data/artefacts/" ./tox.sh -e doc
+$ xdg-open build/doc/index.html # Linux
+$ open build/doc/index.html # Mac
+```
+
+## Quickstart
+
+> This quickstart is an abbreviated form of the getting started guide from the
+> main documentation.
+
+The ``./scheduler_development.sh`` script will build a containerised version of
+the tool and run it with the repository directory mounted read-only under
+``/usr/src/app`` inside the container. As such you can have development-local
+configuration inside the repository.
+
+When *first running* the tool, you will need to create some configuration. (See
+the "Configuration" section in the documentation for what is required.)
+
+```bash
+$ cd /path/to/this/repo
+$ mkdir .scheduler
+$ cp configuration-template.yaml .scheduler/configuration.yaml
+# ... edit configuration, see below ...
+```
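+
+For illustration, a minimal configuration based on
+``configuration-template.yaml`` might look like this (the sheet key is a
+placeholder):
+
+```yaml
+sheets:
+  service_account_credentials_path: ./.scheduler/credentials.json
+  keys:
+    - 'some-long-obscure-string'
+```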
+
+Once configured, the scheduler can be run as follows:
+
+```bash
+$ ./scheduler_development.sh
+```
+
+## Running tests
+
+The tests may be run using tox:
+
+```bash
+$ ./tox.sh
+```
diff --git a/compose.sh b/compose.sh
new file mode 100755
index 0000000..c775e35
--- /dev/null
+++ b/compose.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+#
+# Wrapper script to run services via docker-compose. Usage:
+#
+#   ./compose.sh <config> [<args>...]
+#
+# Will be expanded into:
+#
+#   docker-compose --file compose/base.yml --file compose/<config>.yml <args>...
+#
+# With <args>... defaulting to "up" if not specified.
+
+config=$1
+shift
+args=${@:-up}
+
+# Exit on failure
+set -e
+
+# Check some config was provided
+if [ -z "${config}" ]; then
+    echo "No configuration specified." >&2
+    exit 1
+fi
+
+# Change to this script's directory
+cd "$( dirname "${BASH_SOURCE[0]}")"
+
+set -x
+exec docker-compose --project-name lecture-capture-scheduler \
+    --file compose/base.yml --file compose/${config}.yml $args
diff --git a/compose/base.env b/compose/base.env
new file mode 100644
index 0000000..3c6ab30
--- /dev/null
+++ b/compose/base.env
@@ -0,0 +1,2 @@
+# Environment variables which should be set when running the application
+# within the development *or* production Docker container.
diff --git a/compose/base.yml b/compose/base.yml
new file mode 100644
index 0000000..2c20322
--- /dev/null
+++ b/compose/base.yml
@@ -0,0 +1,2 @@
+# docker-compose file containing shared resources
+version: '3.2'
diff --git a/compose/development.env b/compose/development.env
new file mode 100644
index 0000000..c8d232f
--- /dev/null
+++ b/compose/development.env
@@ -0,0 +1,2 @@
+# Environment variables which should be set when running the application
+# within the development Docker container.
diff --git a/compose/development.yml b/compose/development.yml
new file mode 100644
index 0000000..767334a
--- /dev/null
+++ b/compose/development.yml
@@ -0,0 +1,21 @@
+# docker-compose file for local development
+version: '3.2'
+services:
+  development:
+    build:
+      context: ..
+      dockerfile: ./Dockerfile
+    entrypoint: ["python", "-m", "scheduler"]
+    # Mount the local directory inside the container as a volume to allow local
+    # changes to be reflected without having to re-build the container.
+    volumes:
+      - type: bind
+        source: ../
+        target: /usr/src/app
+        read_only: true
+    env_file:
+      - base.env
+      - development.env
+
+volumes:
+  egg-info:
diff --git a/compose/tox.env b/compose/tox.env
new file mode 100644
index 0000000..1b032ef
--- /dev/null
+++ b/compose/tox.env
@@ -0,0 +1,5 @@
+# Environment variables which should be set when running the tests via tox.
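+#
+# The TOXINI_* variables are consumed by tox.ini: they point tox's work and
+# artefact directories at the persistent tox-data volume mounted by tox.yml so
+# that virtualenvs can be cached between runs.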
+TOXINI_WORK_DIR=/tmp/tox-data/work
+TOXINI_ARTEFACT_DIR=/tmp/tox-data/artefacts
+TOXINI_SITEPACKAGES=True
+COVERAGE_FILE=/tmp/tox-data/coverage
diff --git a/compose/tox.yml b/compose/tox.yml
new file mode 100644
index 0000000..178baa4
--- /dev/null
+++ b/compose/tox.yml
@@ -0,0 +1,18 @@
+# docker-compose file for testing with container image.
+version: '3.2'
+services:
+  tox:
+    build:
+      context: ..
+      dockerfile: ./Dockerfile
+    entrypoint: ["tox"]
+    volumes:
+      - tox-data:/tmp/tox-data
+    env_file:
+      - base.env
+      - tox.env
+
+volumes:
+  # A persistent volume for tox to store its working data. This allows caching
+  # of virtualenvs between runs.
+  tox-data:
diff --git a/configuration-template.yaml b/configuration-template.yaml
new file mode 100644
index 0000000..600037d
--- /dev/null
+++ b/configuration-template.yaml
@@ -0,0 +1,16 @@
+---
+# Scheduler configuration template.
+
+# Configuration of Google sheets ingest.
+sheets:
+  # Service account credentials. If non-absolute, this path is relative to the
+  # current working directory when the script is run.
+  service_account_credentials_path: ./.scheduler/credentials.json
+
+  # List of Google sheet keys which represent sheets to be ingested. Each key
+  # is the long, random-looking string present in the URL for a sheet.
+  #
+  # In order to be ingested, a sheet must be shared with the email address of
+  # the service account. It is preferable if the sheet is shared "view only".
+  keys:
+    - 'some-long-obscure-string'
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..87e7239
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Lecture Capture Scheduler documentation build configuration file,
+# created by sphinx-quickstart on Fri Dec 8 11:20:11 2017.
+#
+# This file is execfile()d with the current directory set to its containing
+# dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out serve
+# to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath('..'))
+
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.githubpages']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Lecture Capture Scheduler'
+copyright = '2019, University of Cambridge Information Services'
+author = 'University of Cambridge Information Services'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '' +# The full version, including alpha/beta/rc tags. +release = '' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# This is required for the alabaster theme +# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars +html_sidebars = { + '**': [ + 'relations.html', # needs 'show_related': True theme option to display + 'searchbox.html', + ] +} + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Schedulerdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Scheduler.tex', + 'Lecture Capture Scheduler Documentation', + 'University of Cambridge Information Services', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'scheduler', + 'Lecture Capture Scheduler Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'Scheduler',
     'Lecture Capture Scheduler Documentation',
     author, 'Scheduler', 'Schedules lecture capture recordings with Opencast.',
     'Miscellaneous'),
]
diff --git a/doc/gettingstarted.rst b/doc/gettingstarted.rst
new file mode 100644
index 0000000..3c1b409
--- /dev/null
+++ b/doc/gettingstarted.rst
@@ -0,0 +1,53 @@
+Getting Started
+===============
+
+Firstly, clone the repository:
+
+.. code:: shell-session
+
+   $ git clone git@gitlab.developers.cam.ac.uk:uis/devops/lecture-capture/scheduler
+   $ cd scheduler
+
+After cloning, you will need to create some initial configuration:
+
+.. code:: shell-session
+
+   $ mkdir .scheduler
+   $ cp configuration-template.yaml .scheduler/configuration.yaml
+
+The configuration template includes information on what configuration is
+required:
+
+.. literalinclude:: ../configuration-template.yaml
+   :language: yaml
+
+Google Sheets
+-------------
+
+In order to ingest data from Google Sheets, you will need to create a service
+account and download JSON-formatted credentials for it. `Instructions on
+creating a service account and credentials
+<https://developers.google.com/identity/protocols/OAuth2ServiceAccount#creatinganaccount>`_
+are available on Google's website. Place the credentials inside the
+``.scheduler`` directory created above in a file named ``credentials.json``.
+
+You will then need to create a Google Sheet with the correct schema. An example
+CSV file with the correct schema is as follows:
+
+.. literalinclude:: sheet-example.csv
+   :language: csv
+
+This sheet should be shared view-only with the email address of the service
+account.
+
+Add the Sheet keys for each Sheet to the ``keys`` section of the configuration.
+
+Run the scheduler
+-----------------
+
+After completing the configuration, the scheduler may be run using the
+``scheduler_development.sh`` script:
+
+.. code:: shell-session
+
+   $ ./scheduler_development.sh
diff --git a/doc/hacking.rst b/doc/hacking.rst
new file mode 100644
index 0000000..ccd93d4
--- /dev/null
+++ b/doc/hacking.rst
@@ -0,0 +1,34 @@
+Overview for Developers
+=======================
+
+The scheduler application is naturally concurrent in that it repeatedly
+interacts with network APIs and has multiple sub-tasks running at the same time.
+In order to simplify the concurrency, the scheduler uses the `asyncio module
+<https://docs.python.org/3/library/asyncio.html>`_ from Python 3.
+
+Main loop
+---------
+
+The main loop is found in :py:mod:`scheduler.loop`. Its job is to launch the
+various ingest tasks, maintain a central scheduler state and launch the
+scheduler backend tasks when the event schedule changes.
+
+The scheduler state is encapsulated within the
+:py:class:`~scheduler.loop.SchedulerState` class. This class contains all the
+logic to track which events have been ingested and which should actually be
+scheduled. The current list of events which should be scheduled can be retrieved
+via the :py:meth:`~scheduler.loop.SchedulerState.get_schedule` method.
+
+The scheduler state is initialised with a list of backend coroutines. These
+coroutines are launched whenever the list of events to schedule changes. They
+are called with the scheduler state object as the first argument.
+
+Ingest loops
+------------
+
+The main loop launches ingest loops as long-running tasks. Each ingest loop
+receives an :py:class:`asyncio.Queue` object which it can use to inform the
+scheduler of new or modified events. When new events are ingested, a **list** of
+:py:class:`scheduler.events.Event` objects should be put on the queue. Events
+which were previously posted and which should now be cancelled are represented
+by setting a "cancelled" flag on the event.
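+
+For illustration only, a minimal ingest loop might look like the following
+sketch (``fetch_upcoming_events`` is a hypothetical helper which returns a list
+of :py:class:`scheduler.events.Event` instances):
+
+.. code:: python
+
+    import asyncio
+
+    async def loop(*, queue, configuration):
+        """Sketch of an ingest loop; not part of the scheduler itself."""
+        while True:
+            # fetch_upcoming_events is hypothetical; a real ingest source would
+            # poll an external system, as scheduler.googlesheets.loop does.
+            events = await fetch_upcoming_events(configuration)
+
+            # The scheduler expects a *list* of Event objects per queue item.
+            await queue.put(events)
+
+            await asyncio.sleep(configuration.poll_delay_seconds)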
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..5df83b6
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,13 @@
+Lecture Capture Events Scheduler
+================================
+
+The Lecture Capture Events Scheduler ingests upcoming timetabled lectures for
+the University and schedules them with an Opencast system.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents
+
+   gettingstarted
+   hacking
+   reference
diff --git a/doc/reference.rst b/doc/reference.rst
new file mode 100644
index 0000000..1422bd3
--- /dev/null
+++ b/doc/reference.rst
@@ -0,0 +1,6 @@
+API Reference
+=============
+
+.. automodule:: scheduler
+    :members:
+    :member-order: bysource
diff --git a/doc/requirements.txt b/doc/requirements.txt
new file mode 100644
index 0000000..5e2c4ce
--- /dev/null
+++ b/doc/requirements.txt
@@ -0,0 +1,3 @@
+# Additional requirements for building documentation
+sphinx
+sphinx_rtd_theme
diff --git a/doc/sheet-example.csv b/doc/sheet-example.csv
new file mode 100644
index 0000000..48d5d4a
--- /dev/null
+++ b/doc/sheet-example.csv
@@ -0,0 +1,9 @@
+id,cancelled,lecturer_crsids,start_at,duration_minutes,title,vle_url,sequence_id,sequence_index,location_id,series_id
+11e64fa2-fe9d-4204-8e2a-a3cf48feb5e9,,"abc12,sprq2",2018-11-28T14:38:00Z,1,"Title 1
+",https://www.vle.cam.ac.uk/replace-with-real-api/courses/70241,https://somesystem.invalid/sequences/ABC987 ,1,room-1,testing
+d90d14b9-4af2-47aa-a3b0-ed8b65491801,,,2018-11-28T18:00:00Z,7,Title 2,https://www.vle.cam.ac.uk/replace-with-real-api/courses/70241,https://somesystem.invalid/sequences/ABC987 ,1,room-1,testing
+e0296186-574d-4734-a491-3c2e1444999b,,,2018-11-28T14:05:00Z,1,"Title 3
+",https://www.vle.cam.ac.uk/replace-with-real-api/courses/70241,https://somesystem.invalid/sequences/ABC987 ,1,room-1,testing
+04a2e477-be3b-493e-b1bf-7164b6b48c91,,,2018-11-28T14:55:00Z,104,"Title 1
+",https://www.vle.cam.ac.uk/replace-with-real-api/courses/70241,https://somesystem.invalid/sequences/ABC987 ,1,room-2,testing
+3134ab7a-f2a6-406b-92d7-b4ca46401252,,,2018-11-28T14:43:00Z,98,Title 3,https://www.vle.cam.ac.uk/replace-with-real-api/courses/70241,https://somesystem.invalid/sequences/ABC987 ,1,room-2,testing
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..16be4d4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+docopt
+google-api-python-client
+gspread
+gspread-asyncio
+oauth2client
+python-dateutil
+pyyaml
+
+# httplib2shim - required to avoid connection pooling issues with the Google API client.
+# The ordering here is important, according to the httplib2shim docs.
+urllib3[secure]
+httplib2shim
+
+# Required to run the tests. So that we may test the production image, tox is
+# included in production image builds.
+tox
diff --git a/scheduler/__init__.py b/scheduler/__init__.py
new file mode 100644
index 0000000..caede4b
--- /dev/null
+++ b/scheduler/__init__.py
@@ -0,0 +1,43 @@
+"""
+The Lecture Capture Events scheduler is a concurrent application written using the Python
+`asyncio library <https://docs.python.org/3/library/asyncio.html>`_.
Its job is to ingest Lecture
+Capture events from event sources, determine which events should be scheduled and schedule them
+with an Opencast system.
+
+Events
+------
+
+.. automodule:: scheduler.events
+    :members:
+    :member-order: bysource
+
+Google Sheets ingest
+--------------------
+
+.. automodule:: scheduler.googlesheets
+    :members:
+    :private-members:
+    :member-order: bysource
+
+Opencast scheduler
+------------------
+
+.. automodule:: scheduler.opencast
+    :members:
+    :member-order: bysource
+
+Main loop
+---------
+
+.. automodule:: scheduler.loop
+    :members:
+    :member-order: bysource
+
+Utilities
+---------
+
+.. automodule:: scheduler.config
+    :members:
+    :member-order: bysource
+
+"""
diff --git a/scheduler/__main__.py b/scheduler/__main__.py
new file mode 100644
index 0000000..f625c43
--- /dev/null
+++ b/scheduler/__main__.py
@@ -0,0 +1,4 @@
+from scheduler.tool import main
+
+if __name__ == '__main__':
+    main()
diff --git a/scheduler/config.py b/scheduler/config.py
new file mode 100644
index 0000000..2d4b446
--- /dev/null
+++ b/scheduler/config.py
@@ -0,0 +1,53 @@
+"""
+Utilities for parsing configuration files.
+
+"""
+
+import logging
+import os
+
+import yaml
+
+LOG = logging.getLogger(__name__)
+
+
+class ConfigurationError(RuntimeError):
+    pass
+
+
+class ConfigurationNotFound(ConfigurationError):
+    def __init__(self):
+        super().__init__('Could not find any configuration file')
+
+
+def load_configuration(location=None):
+    """
+    Load configuration and return the parsed configuration as a dictionary. Pass a non-None
+    location to override the default search path.
+
+    :raises: ConfigurationError if the configuration could not be loaded.
+
+    """
+    if location is not None:
+        paths = [location]
+    else:
+        if 'SCHEDULER_CONFIGURATION' in os.environ:
+            paths = [os.environ['SCHEDULER_CONFIGURATION']]
+        else:
+            paths = []
+        paths.extend([
+            os.path.join(os.getcwd(), '.scheduler/configuration.yaml'),
+            os.path.expanduser('~/.scheduler/configuration.yaml'),
+            '/etc/scheduler/configuration.yaml'
+        ])
+
+    valid_paths = [path for path in paths if os.path.isfile(path)]
+
+    if len(valid_paths) == 0:
+        LOG.error('Could not find configuration file. Tried:')
+        for path in paths:
+            LOG.error('"%s"', path)
+        raise ConfigurationNotFound()
+
+    with open(valid_paths[0]) as f:
+        # Use safe_load: the configuration contains only plain YAML types.
+        return yaml.safe_load(f)
diff --git a/scheduler/events.py b/scheduler/events.py
new file mode 100644
index 0000000..be0ae83
--- /dev/null
+++ b/scheduler/events.py
@@ -0,0 +1,36 @@
+"""
+Events are represented by a :py:class:`.Event` instance.
+
+"""
+import dataclasses
+import datetime
+import typing
+
+
+@dataclasses.dataclass
+class Event:
+    """
+    An event specified by an ingest source. This class is a `dataclass
+    <https://docs.python.org/3/library/dataclasses.html>`_ and supports all the dataclass methods.
+
+    """
+    id: str
+    cancelled: bool
+    lecturer_crsids: typing.Sequence[str]
+    title: str
+    start_at: datetime.datetime
+    duration: datetime.timedelta
+    vle_url: str
+    sequence_id: str
+    sequence_index: int
+    location_id: str
+    series_id: str
+
+
+@dataclasses.dataclass
+class EventMetadata:
+    """
+    Metadata added to an event by the scheduler.
+
+    """
+    agent_id: str
diff --git a/scheduler/googlesheets.py b/scheduler/googlesheets.py
new file mode 100644
index 0000000..f1581a7
--- /dev/null
+++ b/scheduler/googlesheets.py
@@ -0,0 +1,395 @@
+"""
+Ingest of events from Google Sheets.
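+
+Watched sheets are polled via the Google Drive API for changes to their modification time. When a
+sheet changes, its rows are fetched, parsed into :py:class:`scheduler.events.Event` instances and
+placed on the scheduler's ingest queue as a single list.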
+ +""" +import asyncio +import dataclasses +import datetime +import logging +import typing + +import dateutil.parser +import dateutil.tz +import googleapiclient.discovery +import gspread_asyncio +import httplib2shim # more modern httplib2 shim +from oauth2client.service_account import ServiceAccountCredentials + +from .events import Event + +LOG = logging.getLogger(__name__) + +# Scopes required to access Google spreadsheets +SCOPES = [ + 'https://spreadsheets.google.com/feeds', + 'https://www.googleapis.com/auth/drive' +] + +# Expected column headings +HEADINGS = [ + 'id', 'cancelled', 'lecturer_crsids', 'start_at', 'duration_minutes', 'title', 'vle_url', + 'sequence_id', 'sequence_index', 'location_id', 'series_id' +] + +#: Default delay (in seconds) between polls to the Drive API. Changes to Google Sheets are slow to +#: manifest themselves in the modification time so there is no advantage in this being any smaller +#: than around 5 minutes. +DEFAULT_POLL_DELAY = 60*5 + + +class ParseError(RuntimeError): + """An exception raised if the parsed Google sheet has an invalid format.""" + + +@dataclasses.dataclass +class Configuration: + """ + Configuration for the Google Sheets ingest. + + """ + # Path to JSON service account credentials + service_account_credentials_path: str + + # List of keys representing spreadsheets to read + keys: typing.List[str] + + # How long to wait between polling Google API for changes + poll_delay_seconds: typing.Union[int, float] = DEFAULT_POLL_DELAY + + @classmethod + def from_dict(cls, dict_): + """ + Construct a :py:class:`.Configuration` instance. + + """ + field_names = {field.name for field in dataclasses.fields(cls)} + required_field_names = { + field.name for field in dataclasses.fields(cls) + if field.default is dataclasses.MISSING + } + + for key in dict_.keys(): + if key not in field_names: + raise ValueError(f'Unknown configuration key: {key}') + + for key in required_field_names: + if key not in dict_: + raise ValueError(f'{key}: required field not set') + + return cls(**dict_) + + +async def loop(*, queue, configuration): + """ + Coroutine which notices changes to watched Google spreadsheets and pushes lists of new + :py:class:`scheduler.events.Event` instances to *queue*. + + :param queue: queue which receives lists of new events + :type queue: asyncio.Queue + + """ + # Load credentials + LOG.info('Loading credentials from: %s', configuration.service_account_credentials_path) + credentials = ServiceAccountCredentials.from_json_keyfile_name( + configuration.service_account_credentials_path, SCOPES) + + # The Google discovery API client prints warnings about not being able to use a file cache + # which is benign but includes an exception traceback which can confuse some log parsers. Set + # the log level for that module to ERROR to silence them. + logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR) + + # Create a Google drive client + drive_service = await _build_drive_service(credentials) + + # Now we've built the service, reset the log level to NOTSET. Unfortunately, there's not an + # official way of determining the actual log level of the logger to reset it to exactly what it + # was before. + logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.NOTSET) + + # Ensure sequence of keys is a set + keys = set(configuration.keys) + + # The following loop asks the Google Drive API which sheets have changed since the latest + # modification time we're aware of. 
The new latest modification time is remembered for the next
+    # iteration of the loop and any changed sheets which we're interested in are passed to
+    # _fetch_and_parse_sheets.
+    #
+    # There appears to be some latency between Google sheets being updated and it being reflected
+    # in the modification time. Delays of up to 5 minutes have been observed. Consequently it's
+    # counter-productive to poll too frequently in this loop.
+    latest_modified_time = None
+    while True:
+        # Form a query for the spreadsheets we're interested in.
+        query = "mimeType = 'application/vnd.google-apps.spreadsheet' and trashed = false"
+        if latest_modified_time is not None:
+            LOG.info('Looking for modified time after %s', latest_modified_time)
+            query = query + f" and modifiedTime > '{latest_modified_time}'"
+        else:
+            LOG.info('Fetching all sheets')
+
+        # Get file metadata from Drive API.
+        files = await _drive_list_files(
+            drive_service, query=query, extra_fields=[
+                'files/modifiedTime', 'files/id'
+            ]
+        )
+
+        # If we found at least one file, update the cached latest modified time to the latest value
+        # we get from the API response so that next time around we only get the files which have
+        # changed since.
+        if len(files) > 0:
+            latest_modified_time = max(
+                files, key=lambda metadata: dateutil.parser.isoparse(metadata['modifiedTime'])
+            )['modifiedTime']
+            LOG.info('New latest modified time: %r', latest_modified_time)
+
+        # Filter returned files to those we're interested in.
+        keys_to_process = [
+            metadata['id'] for metadata in files if metadata['id'] in keys
+        ]
+
+        # Process sheets we're interested in. This co-routine logs exceptions but does not
+        # re-raise them.
+        await _fetch_and_parse_sheets(credentials, keys_to_process, queue)
+
+        # Wait for the next poll time.
+        LOG.info('Sleeping for %s seconds', configuration.poll_delay_seconds)
+        await asyncio.sleep(configuration.poll_delay_seconds)
+
+
+async def _fetch_and_parse_sheets(credentials, keys, queue):
+    """
+    Co-routine which processes all passed sheet keys. Exceptions from processing are captured and
+    logged but are not re-raised to the caller.
+
+    """
+    # Create a Google Sheets API client. We have to do this here because we will likely go for long
+    # periods before we load data from Google sheets and the authorisation token used by the client
+    # has limited lifetime.
+    sheets = await gspread_asyncio.AsyncioGspreadClientManager(lambda: credentials).authorize()
+
+    # Pass return_exceptions=True so that a failure in one sheet is returned as a result rather
+    # than raised, and so does not abort processing of the other sheets.
+    results = await asyncio.gather(
+        *[_fetch_and_parse_sheet(sheets, key, queue) for key in keys],
+        return_exceptions=True
+    )
+    for result, key in zip(results, keys):
+        if isinstance(result, Exception):
+            LOG.error('sheet %s: error: %s', key, result, exc_info=result)
+
+
+async def _fetch_and_parse_sheet(sheets, key, queue):
+    """
+    Co-routine which processes an individual sheet. Any exceptions from processing the sheet are
+    raised back to the caller. Exceptions for individual rows are logged but are not re-raised.
+
+    """
+    sheet = await sheets.open_by_key(key)
+
+    LOG.info('sheet: %s: getting rows for first worksheet', key)
+    worksheet = await sheet.get_worksheet(0)
+    rows = await worksheet.get_all_values()
+    LOG.info('sheet: %s: got %s row(s)', key, len(rows))
+    await queue.put(parse_sheet(rows))
+
+
+def parse_sheet(rows):
+    """
+    Parse the rows of a sheet.
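+
+    Returns a list of :py:class:`scheduler.events.Event` instances, one for each successfully
+    parsed row. Rows which fail to parse are logged and skipped.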
+
+    :param rows: sequence of sequences of strings representing the rows of the sheet
+
+    """
+    if len(rows) < 1:
+        raise ParseError('There must be at least one heading row')
+
+    headings, data = rows[0], rows[1:]
+    if any(expected != got for expected, got in zip(HEADINGS, headings)):
+        raise ParseError(f'Heading row mismatch. Expected "{HEADINGS}", got "{headings}"')
+
+    # Iterate over each data row, converting it into a dictionary and passing to parse_event
+    ingest_events = []
+    for datum in data:
+        datum_dict = {key: value for key, value in zip(headings, datum)}
+        event_id = datum_dict.get('id', '{unknown}').strip()
+        try:
+            event = parse_event(datum_dict)
+        except ParseError as e:
+            LOG.warning('Skipping event %s which failed to parse: %s', event_id, e, exc_info=e)
+        else:
+            ingest_events.append(event)
+
+    return ingest_events
+
+
+def parse_event(event_dict):
+    """
+    Parse a single event row into an :py:class:`~scheduler.events.Event` instance. A cancelled
+    event is indicated by the ``cancelled`` flag on the returned event.
+
+    :param event_dict: a dictionary representation of a row
+    :raises: :py:class:`.ParseError` if the event cannot be parsed.
+
+    """
+    # We use _get_str for most fields but the id needs to be strictly checked.
+    event_id = event_dict.get('id')
+    if event_id is None or not isinstance(event_id, str) or event_id.strip() == '':
+        raise ParseError(f'invalid event id: "{event_id}"')
+
+    cancelled = _get_str(event_dict, 'cancelled')
+    if cancelled == 'Y':
+        is_cancelled = True
+    elif cancelled == '' or cancelled == 'N':
+        is_cancelled = False
+    else:
+        raise ParseError(f'cancelled should be empty, "Y" or "N". It was: "{cancelled}"')
+
+    # Strip any leading/trailing whitespace from id.
+    event_id = event_id.strip()
+
+    # Parse lecturer crsids list
+    lecturer_crsids_list = _get_str(event_dict, 'lecturer_crsids')
+    lecturer_crsids = [
+        crsid.strip() for crsid in lecturer_crsids_list.split(',')
+        if crsid.strip() != ''
+    ]
+
+    # Parse starting date and duration
+    try:
+        start_at = _iso8601_parse(_get_str(event_dict, 'start_at'))
+    except ValueError:
+        raise ParseError(
+            'Event has invalid start time: %r' % _get_str(event_dict, 'start_at'))
+
+    try:
+        duration = datetime.timedelta(minutes=int(_get_str(event_dict, 'duration_minutes')))
+    except ValueError:
+        raise ParseError(
+            'Event has invalid duration: %r' % _get_str(event_dict, 'duration_minutes'))
+
+    if duration.total_seconds() <= 0:
+        raise ParseError(f'Event has invalid duration: {duration}')
+
+    sequence_id = _get_str(event_dict, 'sequence_id')
+    if sequence_id == '':
+        raise ParseError('empty sequence id')
+    try:
+        sequence_index = int(_get_str(event_dict, 'sequence_index'))
+    except ValueError:
+        raise ParseError('invalid sequence index: "%s"' % (_get_str(event_dict, 'sequence_index')))
+    if sequence_index < 0:
+        raise ParseError(f'invalid sequence index: {sequence_index}')
+
+    location_id = _get_str(event_dict, 'location_id')
+    if location_id == '':
+        raise ParseError('empty location id')
+
+    series_id = _get_str(event_dict, 'series_id')
+    if series_id == '':
+        raise ParseError('empty series id')
+
+    return Event(
+        id=event_id,
+        cancelled=is_cancelled,
+        lecturer_crsids=lecturer_crsids,
+        title=_get_str(event_dict, 'title'),
+        start_at=start_at, duration=duration,
+        vle_url=_get_str(event_dict, 'vle_url'),
+        sequence_id=sequence_id, sequence_index=sequence_index,
+        location_id=location_id,
+        series_id=series_id
+    )
+
+
+def _get_str(d, key, default=None, *, strip=True):
+    """
+    Return a key's value from dictionary *d* as a string.
If *strip* is True, also strip leading + and trailing whitespace. If the key does not exist, use *default* as the value + + >>> _get_str({'foo': 34}, 'foo') + '34' + >>> _get_str({'foo': ' bar '}, 'foo') + 'bar' + >>> _get_str({'foo': None}, 'foo') + 'None' + >>> _get_str({'foo': ' bar '}, 'foo', strip=False) + ' bar ' + >>> _get_str({'foo': 'bar'}, 'buzz') + 'None' + >>> _get_str({'foo': 'bar'}, 'buzz', default=' bar ') + 'bar' + >>> _get_str({'foo': 'bar'}, 'buzz', ' bar ') + 'bar' + + """ + value = str(d.get(key, default)) + return value if not strip else value.strip() + + +def _iso8601_parse(datetime_str): + """ + Parse an ISO8601 formatted date string to a datetime object. The returned object is always in + the UTC timezone and appropriate timezone conversion is applied from the input string. + + >>> _iso8601_parse('2006-01-02T15:04:05-07:00') + datetime.datetime(2006, 1, 2, 22, 4, 5, tzinfo=tzutc()) + >>> _iso8601_parse('2006-01-02') + datetime.datetime(2006, 1, 2, 0, 0, tzinfo=tzutc()) + >>> _iso8601_parse('2006-01-02T-07') + datetime.datetime(2006, 1, 2, 7, 0, tzinfo=tzutc()) + >>> _iso8601_parse('not a date') + Traceback (most recent call last): + ... + ValueError: ... + >>> _iso8601_parse('') + Traceback (most recent call last): + ... + ValueError: ... + + """ + return dateutil.parser.isoparse(datetime_str).astimezone(dateutil.tz.tzutc()) + + +async def _build_drive_service(credentials): + """ + Co-routine which builds a Google Drive API service using the Google API client libraries. Runs + the code within the default asyncio executor. + + """ + def build_service(): + http = httplib2shim.Http() + return googleapiclient.discovery.build('drive', 'v3', http=credentials.authorize(http)) + return await asyncio.get_running_loop().run_in_executor(None, build_service) + + +async def _drive_list_files(drive_service, query=None, extra_fields=None): + """ + Co-routine which uses the Google drive API to get a list of files. Automatically handles paging + as necessary to retrieve a full list. + + """ + # Get the current asyncio event loop so that we can call the drive API in the loop's default + # thread pool. + loop = asyncio.get_running_loop() + + # List of file metadata resources to return + files = [] + + # List of fields in metadata to return + fields = ','.join(extra_fields) if extra_fields is not None else None + + # Loop while we wait for nextPageToken to be "none" + page_token = None + while True: + list_response = await loop.run_in_executor( + None, drive_service.files().list( + corpora='user,allTeamDrives', supportsTeamDrives=True, fields=fields, + includeTeamDriveItems=True, pageToken=page_token, q=query + ).execute + ) + + # Add returned files to response list + files.extend(list_response.get('files', [])) + + # Get the token for the next page + page_token = list_response.get('nextPageToken') + if page_token is None: + break + + return files diff --git a/scheduler/loop.py b/scheduler/loop.py new file mode 100644 index 0000000..ce8257d --- /dev/null +++ b/scheduler/loop.py @@ -0,0 +1,152 @@ +""" +Main event loop for the scheduler. + +""" +import asyncio +import logging + +from . import events +from . import googlesheets +from . import opencast + +LOG = logging.getLogger(__name__) + + +def run(configuration): + # Run main loop(s) + asyncio.run(main_loop(configuration)) + + +async def main_loop(configuration): + """ + Coroutine which runs all the main loops concurrently until they all exit. 
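+
+    The scheduler state is created with the Opencast backend's
+    :py:func:`~scheduler.opencast.update_state` coroutine registered as its only state change
+    handler, and the Google Sheets ingest loop is then run until it exits or raises.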
+
+    """
+    # Initialise the scheduler state with a list of coroutines which should be scheduled when the
+    # state changes.
+    state = SchedulerState(handler_coros=[opencast.update_state])
+
+    # HACK: hard-code lecturer preferences and location -> agent mappings.
+    state.opt_ins |= {'spqr1', 'abc2'}
+    state.location_to_agents.update({
+        'room-1': 'agent-a',
+        'room-2': 'agent-b',
+    })
+
+    # A list of co-routines which represent ingest loops. Each loop will ingest a set of events.
+    # It then puts sequences of these events on the scheduler state queue.
+    ingest_loops = [
+        googlesheets.loop(
+            queue=state.queue,
+            configuration=googlesheets.Configuration.from_dict(configuration.get('sheets', {}))
+        )
+    ]
+
+    # Run long-lived ingest loops concurrently. Any one loop raising an exception will cause the
+    # entire task to terminate with that exception. This ensures that failures are loud rather
+    # than one of our task loops silently not running.
+    return await asyncio.gather(*ingest_loops)
+
+
+class SchedulerState:
+    """
+    The state of the scheduler. Contains information on all events ingested, the current lecturer
+    opt-in/out state and the mapping from room locations to agent ids.
+
+    """
+    def __init__(self, *, handler_coros=[]):
+        # List of coroutines which should be scheduled when state changes
+        self._handler_coros = handler_coros
+
+        # Create in-memory database of events keyed by id
+        self.events = {}
+
+        # Create in-memory database of crsids which have opted in to lecture capture
+        self.opt_ins = set()
+
+        # Create in-memory database of mappings between location ids and agent names.
+        self.location_to_agents = {}
+
+        # Queue to send event bundles to
+        self.queue = asyncio.Queue()
+
+        # Schedule event queue listener
+        asyncio.ensure_future(self._listen_for_events())
+
+    def get_schedule(self):
+        """
+        Return a sequence of (event, metadata) pairs which show which events should be scheduled at
+        this moment in time. Events which should not be scheduled are filtered out of the list.
+
+        """
+        events_and_metadata = []
+
+        for event in self.events.values():
+            # Don't schedule this event if it is cancelled
+            if event.cancelled:
+                continue
+
+            # Form event metadata from in-memory databases of agents
+            metadata = events.EventMetadata(
+                agent_id=self.location_to_agents.get(event.location_id))
+
+            # Don't schedule this event if we don't know the agent.
+            if metadata.agent_id is None:
+                LOG.warning(
+                    'Skipping event "%s" as location "%s" could not be mapped to an agent id',
+                    event.id, event.location_id
+                )
+                continue
+
+            # Don't schedule this event if any lecturer has not opted-in.
+            if any(crsid not in self.opt_ins for crsid in event.lecturer_crsids):
+                LOG.warning(
+                    'Skipping event "%s" as not all of crsids %s have opted-in',
+                    event.id, ', '.join(event.lecturer_crsids)
+                )
+                LOG.warning('Opt-in statuses: %s', ', '.join([
+                    '{}: {}'.format(crsid, crsid in self.opt_ins)
+                    for crsid in event.lecturer_crsids
+                ]))
+                continue
+
+            # This event has passed all the tests, it can be scheduled.
+            events_and_metadata.append((event, metadata))
+
+        return events_and_metadata
+
+    async def _listen_for_events(self):
+        """
+        Co-routine which loops waiting for new events on the event queue and updates the internal
+        state accordingly. Calls state change handler co-routines.
+
+        """
+        LOG.info('Waiting for new events')
+        while True:
+            # Form a new mapping from event id to events which will replace the self.events
+            # state.
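+            # Events are only ever added or replaced here, never removed: a cancellation
+            # arrives as a new version of the event with its cancelled flag set and is
+            # filtered out later by get_schedule().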
+ new_events = {} + new_events.update(self.events) + + # Get all incoming event bundles from the queue. + while True: + new_events.update({event.id: event for event in await self.queue.get()}) + if self.queue.empty(): + break + + # Update state atomically. This state update is atomic since our concurrency is + # co-routine based and so we know we will not be interrupted until the next await. + self.events = new_events + LOG.info('Total number of events: %s', len(self.events)) + + # Notify handlers of change in state. + await self._state_changed() + + async def _state_changed(self): + """ + Coroutine. Call after changing internal state to notify state change handlers. The return + value can be await-ed to wait for state change handlers to complete. + + """ + # TODO: if multiple co-routines start notifying about state change, this should be + # re-factored to use something like asyncio.Condition. + return await asyncio.gather(*[f(self) for f in self._handler_coros]) diff --git a/scheduler/opencast.py b/scheduler/opencast.py new file mode 100644 index 0000000..bfbb446 --- /dev/null +++ b/scheduler/opencast.py @@ -0,0 +1,24 @@ +""" +Opencast scheduling backend. + +""" +import logging + +LOG = logging.getLogger(__name__) + + +async def update_state(state): + """ + Co-routine called when the local state has changed. + + """ + # Get the current schedule + schedule = state.get_schedule() + LOG.info('Events changed: %s in total, %s to schedule', len(state.events), len(schedule)) + + # Until scheduling via API is implemented, log what would be scheduled. + for event, metadata in schedule: + LOG.info('Would schedule %r, %r', event, metadata) + + # TODO: implement scheduling. Since we're running in a co-routine, this should be done using + # an asyncio-aware library such as aiohttp. diff --git a/scheduler/tests/__init__.py b/scheduler/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scheduler/tests/test_googlesheets.py b/scheduler/tests/test_googlesheets.py new file mode 100644 index 0000000..7e6ecb9 --- /dev/null +++ b/scheduler/tests/test_googlesheets.py @@ -0,0 +1,219 @@ +import datetime +import json +import unittest + +from dateutil.tz import tzutc + +from .. import googlesheets + +# Encode the valid event template as JSON to make doubly sure none of the tests below mutate it. 
+VALID_EVENT_TEMPLATE = json.dumps({ + 'id': 'abc-123', + 'cancelled': '', + 'lecturer_crsids': 'spqr1', + 'start_at': '2006-01-02T15:04:05-07:00', + 'duration_minutes': '47', + 'title': 'Test event', + 'vle_url': 'https://vle.invalid/course?id=45', + 'sequence_id': 'seq-xyz', + 'sequence_index': '1', + 'location_id': 'null_island', + 'series_id': 'botolphs-study' +}) + + +class ParseSheetTestCase(unittest.TestCase): + def setUp(self): + # copy the headings list so that it may be safely mutated + self.expected_headings = list(googlesheets.HEADINGS) + + # Create two valid events in a valid sheet + self.rows = [googlesheets.HEADINGS] + d = json.loads(VALID_EVENT_TEMPLATE) + d['id'] = 'event-1' + self.rows.append([d[k] for k in googlesheets.HEADINGS]) + d = json.loads(VALID_EVENT_TEMPLATE) + d['id'] = 'event-2' + d['start_at'] = '2006-01-02T15:04:05Z' + self.rows.append([d[k] for k in googlesheets.HEADINGS]) + + def test_empty_sheet(self): + """An empty sheet should raise a ParseError.""" + with self.assertRaises(googlesheets.ParseError): + googlesheets.parse_sheet([]) + + def test_empty_sheet_with_headers(self): + """An empty sheet but with expected headers should succeed.""" + googlesheets.parse_sheet([self.expected_headings]) + + def test_empty_sheet_with_extra_headers(self): + """An empty sheet but with expected headers and extra columns should succeed.""" + googlesheets.parse_sheet([self.expected_headings + ['foo', 'bar']]) + + def test_empty_sheet_with_missing_headers(self): + """An empty sheet but with a missing header should raise a parse error.""" + del self.expected_headings[1] + del self.expected_headings[-2] + with self.assertRaises(googlesheets.ParseError): + googlesheets.parse_sheet([self.expected_headings]) + + def test_basic_parse(self): + """A simple valid sheet succeeds.""" + events = googlesheets.parse_sheet(self.rows) + self.assertEqual(len(events), 2) + + def test_parse_error(self): + """A sheet with a parse error in the first row still parses the second.""" + self.rows[1][googlesheets.HEADINGS.index('start_at')] = 'not-a-date' + events = googlesheets.parse_sheet(self.rows) + self.assertEqual(len(events), 1) + + +class ParseEventTestCase(unittest.TestCase): + def setUp(self): + # Create a valid event dict + self.event_dict = json.loads(VALID_EVENT_TEMPLATE) + + def test_valid_event(self): + """A valid event parses without error.""" + googlesheets.parse_event(self.event_dict) + + def test_id_parse(self): + """Event id field is parsed correctly.""" + self.assert_event_parse('id', 'xyz-123') + self.assert_event_parse('id', '', raises=True) + self.assert_event_parse('id', ' xyz-123 ', expected_value='xyz-123') + + def test_start_at_parse(self): + """Event start_at field is parsed correctly.""" + self.assert_event_parse( + 'start_at', '2006-01-02T15:04:05-07:00', + datetime.datetime(2006, 1, 2, 22, 4, 5, tzinfo=tzutc()) + ) + + def test_duration_minutes_parse(self): + """Event duration_minutes field is parsed correctly.""" + def to_mins(n): + return datetime.timedelta(minutes=n) + self.assert_event_parse('duration_minutes', '123', to_mins(123), event_name='duration') + self.assert_event_parse('duration_minutes', ' 123 ', to_mins(123), event_name='duration') + self.assert_event_parse('duration_minutes', 'not-an-int', raises=True) + self.assert_event_parse('duration_minutes', '0', raises=True) + self.assert_event_parse('duration_minutes', '-10', raises=True) + + def test_lecturer_crsids_parse(self): + """Event lecturer_crsids field is parsed correctly.""" + 
self.assert_event_parse('lecturer_crsids', '', []) + self.assert_event_parse('lecturer_crsids', ',,,', []) + self.assert_event_parse('lecturer_crsids', ' abc1 ', ['abc1']) + self.assert_event_parse( + 'lecturer_crsids', ' abc1, spqr2, xyz3 ', ['abc1', 'spqr2', 'xyz3']) + self.assert_event_parse('lecturer_crsids', ' abc1,, , xyz3 ', ['abc1', 'xyz3']) + + def test_title_parse(self): + """Event title field is parsed correctly.""" + self.assert_event_parse('title', 'TESTING') + self.assert_event_parse('title', ' TESTING ', 'TESTING') + + def test_vle_url_parse(self): + """Event vle_url field is parsed correctly.""" + self.assert_event_parse('vle_url', 'http://vle.invalid/some/path') + + def test_sequence_id_parse(self): + """Event sequence_id field is parsed correctly.""" + self.assert_event_parse('sequence_id', 'some-id') + self.assert_event_parse('sequence_id', '', raises=True) + self.assert_event_parse('sequence_id', ' some-id ', expected_value='some-id') + + def test_sequence_index_parse(self): + """Event sequence_index field is parsed correctly.""" + self.assert_event_parse('sequence_index', '987', 987) + self.assert_event_parse('sequence_index', ' 987 ', 987) + self.assert_event_parse('sequence_index', '0', 0) + self.assert_event_parse('sequence_index', 'not-an-int', raises=True) + self.assert_event_parse('sequence_index', '-10', raises=True) + self.assert_event_parse('sequence_index', '', raises=True) + + def test_location_id_parse(self): + """Event location_id field is parsed correctly.""" + self.assert_event_parse('location_id', 'some-loc-id') + self.assert_event_parse('location_id', ' some-loc-id ', 'some-loc-id') + self.assert_event_parse('location_id', '', raises=True) + + def test_series_id_parse(self): + """Event series_id field is parsed correctly.""" + self.assert_event_parse('series_id', 'some-series-id') + self.assert_event_parse('series_id', ' some-series-id ', 'some-series-id') + self.assert_event_parse('series_id', '', raises=True) + + def test_required_fields(self): + """Empty required fields cause a parse error.""" + self.assert_event_parse('start_at', '', raises=True) + self.assert_event_parse('duration_minutes', '', raises=True) + self.assert_event_parse('sequence_id', '', raises=True) + self.assert_event_parse('sequence_index', '', raises=True) + self.assert_event_parse('location_id', '', raises=True) + self.assert_event_parse('series_id', '', raises=True) + + def test_cancelled_parse(self): + """Event cancelled field is parsed correctly.""" + self.assert_event_parse('cancelled', '', expected_value=False) + self.assert_event_parse('cancelled', ' ', expected_value=False) + self.assert_event_parse('cancelled', 'Y', expected_value=True) + self.assert_event_parse('cancelled', 'N', expected_value=False) + self.assert_event_parse('cancelled', ' Y ', expected_value=True) + self.assert_event_parse('cancelled', ' N ', expected_value=False) + self.assert_event_parse('cancelled', '?', raises=True) + + def assert_event_parse( + self, field_name, dict_value, expected_value=None, raises=False, event_name=None): + expected_value = expected_value if expected_value is not None else dict_value + event_name = event_name if event_name is not None else field_name + self.event_dict[field_name] = dict_value + if raises: + with self.assertRaises(googlesheets.ParseError): + googlesheets.parse_event(self.event_dict) + else: + event = googlesheets.parse_event(self.event_dict) + self.assertEqual(getattr(event, event_name), expected_value) + + +class ConfigurationTestCase(unittest.TestCase): + 
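+    """Test parsing of the Google Sheets ingest configuration."""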
def setUp(self): + self.valid_configuration = { + 'service_account_credentials_path': '/data/credentials.json', + 'keys': ['abcdefg', '12345678'] + } + + def test_basic_functionality(self): + """A valid configuration is parsed.""" + config = googlesheets.Configuration.from_dict(self.valid_configuration) + self.assertEqual( + config.service_account_credentials_path, + self.valid_configuration['service_account_credentials_path'] + ) + self.assertEqual(config.keys, self.valid_configuration['keys']) + self.assertEqual(config.poll_delay_seconds, googlesheets.DEFAULT_POLL_DELAY) + + def test_missing_values(self): + """Required keys are required.""" + for k in ['service_account_credentials_path', 'keys']: + d = {} + d.update(self.valid_configuration) + del d[k] + with self.assertRaises(ValueError): + googlesheets.Configuration.from_dict(d) + + def test_poll_delay(self): + """Overriding the poll delay is possible.""" + self.valid_configuration['poll_delay_seconds'] = googlesheets.DEFAULT_POLL_DELAY + 10.5 + config = googlesheets.Configuration.from_dict(self.valid_configuration) + self.assertEqual(config.poll_delay_seconds, googlesheets.DEFAULT_POLL_DELAY + 10.5) + + def test_unknown_keys(self): + """Unknown keys raise an exception.""" + d = {} + d.update(self.valid_configuration) + d['foo'] = 'bar' + with self.assertRaises(ValueError): + googlesheets.Configuration.from_dict(d) diff --git a/scheduler/tests/test_loop.py b/scheduler/tests/test_loop.py new file mode 100644 index 0000000..628ec66 --- /dev/null +++ b/scheduler/tests/test_loop.py @@ -0,0 +1,140 @@ +import dataclasses +import datetime +import random +import unittest +import uuid + +from .. import events +from .. import loop + + +def _random_string(k=20): + """ + Use random's RNG to generate a random string + + """ + return ''.join(random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=k)) + + +# Create some random data. Use a fixed seed for reproducibility. +random.seed(0xdeadbeef) +FAKE_CRSIDS = ['abc1', 'def1', 'spqr2', 'xyz89'] +FAKE_SEQUENCES = [_random_string() for _ in range(10)] +FAKE_LOCATIONS = [_random_string() for _ in range(10)] +FAKE_SERIES = [_random_string() for _ in range(10)] + +# Mapping from location to agent +LOCATION_TO_AGENTS = {loc: _random_string() for loc in FAKE_LOCATIONS} + + +class StateTestCase(unittest.TestCase): + def setUp(self): + # Reset RNG seed to ensure results are reproducible + random.seed(0xface1234) + + # Create a blank state + self.state = loop.SchedulerState() + + # Populate the state with some valid events + self._populate_state() + + def test_basic_functionality(self): + """ + A state populated with events which should all be scheduled schedules all the events. + + """ + schedule = self.state.get_schedule() + events = self.state.events + + # More than 0 events were ingested and they are all scheduled + scheduled_event_ids = {e.id for e, _ in schedule} + event_ids = set(events.keys()) + self.assertGreater(len(events), 0) + self.assertEqual(scheduled_event_ids, event_ids) + + for event, metadata in schedule: + self.assertEqual(metadata.agent_id, LOCATION_TO_AGENTS[event.location_id]) + + def test_cancelled_events(self): + """ + Cancelling an event removes it from the schedule. 
+
+        """
+        scheduled_event_ids_before = self._get_scheduled_event_ids()
+        new_event = self._update_random_event(cancelled=True)
+        self.assertIn(new_event.id, scheduled_event_ids_before)
+        self.assertNotIn(new_event.id, self._get_scheduled_event_ids())
+
+    def test_lack_of_opt_in(self):
+        """
+        An event whose lecturer has not opted in is not scheduled until the opt-in is recorded.
+
+        """
+        test_crsid = 'testing12345'
+        new_event = self._update_random_event(lecturer_crsids=[test_crsid])
+
+        # An event without an opt-in from its lecturer is not scheduled
+        self.assertNotIn(new_event.id, self._get_scheduled_event_ids())
+
+        # Adding the opt-in schedules the event
+        self.state.opt_ins |= {test_crsid}
+        self.assertIn(new_event.id, self._get_scheduled_event_ids())
+
+    def _populate_state(self, event_count=10):
+        """
+        Populate the state with events. No events are cancelled and all lecturers will have opted
+        in.
+
+        """
+        events = [self._fake_event() for _ in range(event_count)]
+        opt_ins = set()
+        for e in events:
+            e.cancelled = False
+            for crsid in e.lecturer_crsids:
+                opt_ins.add(crsid)
+
+        self.state.opt_ins |= opt_ins
+        self.state.location_to_agents.update(LOCATION_TO_AGENTS)
+        self._update_state_events(events)
+
+    def _update_state_events(self, events):
+        self.state.events.update({e.id: e for e in events})
+
+    def _update_random_event(self, **changes):
+        """
+        Update a random event with the changes passed. Return the new event. The event is selected
+        from all events in the state, not just the scheduled ones.
+
+        """
+        e_id = random.choice(sorted(e.id for e in self.state.events.values()))
+        new_event = dataclasses.replace(self.state.events[e_id], **changes)
+        self._update_state_events([new_event])
+        return new_event
+
+    def _get_scheduled_event_ids(self):
+        """
+        Return a set of event ids which are currently in the schedule.
+
+        """
+        return {e.id for e, _ in self.state.get_schedule()}
+
+    def _fake_event(self):
+        """
+        Create a fake populated event.
+
+        """
+        return events.Event(
+            id=uuid.uuid4().hex,
+            cancelled=random.choice([True, False]),
+            lecturer_crsids=random.sample(FAKE_CRSIDS, random.randint(0, len(FAKE_CRSIDS))),
+            title=_random_string(),
+            start_at=datetime.datetime(day=1, month=12, year=2016) + datetime.timedelta(
+                seconds=random.randint(0, 60*60*24*365*10)  # an offset of up to ~10 years
+            ),
+            duration=datetime.timedelta(seconds=random.randint(1, 60*60*3)),
+            vle_url=_random_string(),
+            sequence_id=random.choice(FAKE_SEQUENCES),
+            sequence_index=random.randint(1, 10),
+            location_id=random.choice(FAKE_LOCATIONS),
+            series_id=random.choice(FAKE_SERIES)
+        )
diff --git a/scheduler/tool.py b/scheduler/tool.py
new file mode 100644
index 0000000..5e72745
--- /dev/null
+++ b/scheduler/tool.py
@@ -0,0 +1,53 @@
+"""
+Schedule Lecture Capture Recordings
+
+Usage:
+    scheduler (-h | --help)
+    scheduler [--quiet] [--configuration=PATH]
+
+Options:
+
+    -h, --help            Show a brief usage summary.
+    --quiet               Reduce logging verbosity.
+    --configuration=PATH  Override location of configuration file (see below).
+
+Configuration:
+
+    Configuration of the tool is via a YAML document. The default location for the configuration is
+    the first file which exists in the following locations:
+
+    - The value of the SCHEDULER_CONFIGURATION environment variable
+    - ./.scheduler/configuration.yaml
+    - ~/.scheduler/configuration.yaml
+    - /etc/scheduler/configuration.yaml
+
+    The --configuration option may be used to override the search path.
+
+"""
+import logging
+import sys
+
+import docopt
+
+from . import config, loop
+
+
+LOG = logging.getLogger(__name__)
+
+
+def main():
+    opts = docopt.docopt(__doc__)
+    logging.basicConfig(level=logging.WARNING if opts['--quiet'] else logging.INFO)
+
+    # Load the configuration
+    try:
+        configuration = config.load_configuration(location=opts.get('--configuration'))
+    except config.ConfigurationError as e:
+        LOG.error('Could not load configuration: %s', e)
+        sys.exit(1)
+
+    try:
+        loop.run(configuration)
+    except Exception as e:
+        LOG.error('Error while running the scheduler: %s', e, exc_info=e)
+        sys.exit(1)
diff --git a/scheduler_development.sh b/scheduler_development.sh
new file mode 100755
index 0000000..53dd65e
--- /dev/null
+++ b/scheduler_development.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# Wrapper script to run the scheduler in development. Arguments are passed
+# directly to the development container.
+
+# Exit on failure
+set -e
+
+# Change to this script's directory
+cd "$( dirname "${BASH_SOURCE[0]}")"
+
+# Execute scheduler, logging command used
+set -x
+exec ./compose.sh development run --rm development "$@"
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0fc099a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,27 @@
+import os
+
+from setuptools import find_packages, setup
+
+
+def load_requirements():
+    """
+    Load requirements file and return non-empty, non-comment lines with leading and trailing
+    whitespace stripped.
+    """
+    with open(os.path.join(os.path.dirname(__file__), 'requirements.txt')) as f:
+        return [
+            line.strip() for line in f
+            if line.strip() != '' and not line.strip().startswith('#')
+        ]
+
+
+setup(
+    name='scheduler',
+    install_requires=load_requirements(),
+    packages=find_packages(),
+    entry_points={
+        'console_scripts': [
+            'scheduler=scheduler.tool:main'
+        ],
+    },
+)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..eb0990d
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,70 @@
+# Tox runner configuration
+#
+# The following optional environment variables can change behaviour. See the
+# comments where they are used for more information.
+#
+# - TOXINI_ARTEFACT_DIR
+# - TOXINI_FLAKE8_VERSION
+# - TOXINI_WORK_DIR
+#
+[tox]
+# Envs which should be run by default.
+envlist=flake8,doc,py3
+# Allow overriding toxworkdir via environment variable
+toxworkdir={env:TOXINI_WORK_DIR:{toxinidir}/.tox}
+# Avoid .egg-info directories
+skipsdist=True
+
+# The "_vars" section is ignored by tox but we place some useful shared
+# variables in it to avoid needless repetition.
+[_vars]
+# Where to write build artefacts. We default to the "build" directory in the
+# tox.ini file's directory. Override with the TOXINI_ARTEFACT_DIR environment
+# variable.
+build_root={env:TOXINI_ARTEFACT_DIR:{toxinidir}/build}
+
+[testenv]
+# Additional dependencies
+deps=
+    -rrequirements.txt
+    coverage
+    pytest
+    pytest-cov
+# Which environment variables should be passed into the environment.
+passenv=
+# Allow people to override the coverage report location should they so wish.
+    COVERAGE_FILE
+# Location of the coverage.xml file
+    COVERAGE_XML_FILE
+# How to run the test suite. Note that arguments passed to tox are passed on to
+# the test command.
+commands=
+    pytest --doctest-modules --cov={toxinidir} --junitxml={[_vars]build_root}/{envname}/junit.xml
+    coverage html --directory {[_vars]build_root}/{envname}/htmlcov/
+    coverage xml -o {env:COVERAGE_XML_FILE:{[_vars]build_root}/{envname}/coverage.xml}
+# Allow sitepackages setting to be overridden via the TOXINI_SITEPACKAGES environment
+# variable. The tox container uses this to avoid re-installing the same packages
+# over and over again.
+sitepackages={env:TOXINI_SITEPACKAGES:False}
+
+[testenv:py3]
+basepython=python3
+
+# Build documentation
+[testenv:doc]
+basepython=python3
+deps=
+    -rdoc/requirements.txt
+commands=sphinx-build -a -v -b html doc/ {[_vars]build_root}/doc/
+
+# Check for PEP8 violations
+[testenv:flake8]
+basepython=python3
+deps=
+# We specify a specific version of flake8 to avoid introducing "false"
+# regressions when new checks are introduced. The version of flake8 used may
+# be overridden via the TOXINI_FLAKE8_VERSION environment variable.
+    flake8=={env:TOXINI_FLAKE8_VERSION:3.6.0}
+commands=
+    flake8 --version
+    flake8 .
diff --git a/tox.sh b/tox.sh
new file mode 100755
index 0000000..0303e14
--- /dev/null
+++ b/tox.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+#
+# Wrapper script to run tox. Arguments are passed directly to tox.
+#
+# The COMPOSE_ARGS environment variable may be used to pass additional arguments
+# to docker-compose.
+#
+# If DISABLE_BIND_MOUNT is set, the repository root will *not* be added as a
+# read-only bind mount. This is mostly useful if you want to test the production
+# image "as is".
+
+# Exit on failure
+set -e
+
+# Change to this script's directory
+cd "$( dirname "${BASH_SOURCE[0]}")"
+
+if [ -z "${DISABLE_BIND_MOUNT}" ]; then
+    VOLUME_ARGS="-v $PWD:/usr/src/app:ro"
+else
+    VOLUME_ARGS=
+fi
+
+# Execute tox runner, logging command used
+set -x
+exec ./compose.sh tox run --rm $VOLUME_ARGS $COMPOSE_ARGS tox "$@"
-- 
GitLab

From f287cc18a2efe4ef5b750fb83ca915095d279929 Mon Sep 17 00:00:00 2001
From: Rich Wareham <rjw57@cam.ac.uk>
Date: Thu, 31 Jan 2019 09:45:13 +0000
Subject: [PATCH 2/4] gitlab-ci: use include:template to bring in AutoDevOps

Now that we're on GitLab 11.7, we can use include:template: to bring in
the default AutoDevOps template.
---
 .gitlab-ci.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4cba28e..41b64d0 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,9 +5,7 @@
 
 include:
   # Bring in the AutoDevOps template from GitLab.
-  #
-  # TODO: when we ship GitLab 11.7, replace this with include:template:
-  - 'https://gitlab.com/gitlab-org/gitlab-ee/raw/master/lib/gitlab/ci/templates/Auto-DevOps.gitlab-ci.yml'
+  - template: Auto-DevOps.gitlab-ci.yml
 
 variables:
   # Disable bits of the AutoDevOps pipeline which we don't use.
-- 
GitLab

From 9e767180080f31293e5e482c54193371e8b9eeae Mon Sep 17 00:00:00 2001
From: Rich Wareham <rjw57@cam.ac.uk>
Date: Thu, 31 Jan 2019 10:21:19 +0000
Subject: [PATCH 3/4] events: document Event fields

---
 scheduler/events.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/scheduler/events.py b/scheduler/events.py
index be0ae83..328d912 100644
--- a/scheduler/events.py
+++ b/scheduler/events.py
@@ -14,16 +14,43 @@ class Event:
     <https://docs.python.org/3/library/dataclasses.html>`_ and supports all the dataclass
     methods.
     """
+    #: A unique id which is stable when other event data changes.
     id: str
+
+    #: Flag which indicates that this event, if previously scheduled, should now be cancelled.
     cancelled: bool
+
+    #: A list of crsids for "lecturers", i.e. people who have had to opt in to this recording.
     lecturer_crsids: typing.Sequence[str]
+
+    #: A descriptive title for the event.
     title: str
+
+    #: The date and time at which the event starts.
     start_at: datetime.datetime
+
+    #: The duration of the event.
     duration: datetime.timedelta
+
+    #: A URL pointing to a related course in the VLE.
     vle_url: str
+
+    #: A unique id which specifies the sequence of lectures which this event is part of.
     sequence_id: str
+
+    #: The index within the sequence at which this event should sit.
     sequence_index: int
+
+    #: A unique id which specifies the physical location where the event takes place.
     location_id: str
+
+    #: A unique id which specifies which series this event is a part of. Note that the "series"
+    #: and "sequence" fields are only weakly related. The "series" field relates to how events
+    #: should be grouped when presented to the user and the "sequence" field describes how they
+    #: are grouped from an organisational point of view.
+    #:
+    #: If one has no information to the contrary, one may use the convention of having one series
+    #: for each VLE course.
     series_id: str
 
 
@@ -33,4 +60,6 @@ class EventMetadata:
     Metadata added to an event by the scheduler.
 
     """
+    #: A unique id which specifies which lecture capture agent has been given responsibility for
+    #: recording the event.
     agent_id: str
-- 
GitLab

From acee2781616de4bf4e94f674be7f1ca89cb6732c Mon Sep 17 00:00:00 2001
From: Rich Wareham <rjw57@cam.ac.uk>
Date: Thu, 31 Jan 2019 10:31:50 +0000
Subject: [PATCH 4/4] loop: use "create_task" from Python 3.7

In Python 3.7, asyncio.create_task() is preferred over
asyncio.ensure_future().
---
 scheduler/loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scheduler/loop.py b/scheduler/loop.py
index ce8257d..4732e9d 100644
--- a/scheduler/loop.py
+++ b/scheduler/loop.py
@@ -71,7 +71,7 @@ class SchedulerState:
         self.queue = asyncio.Queue()
 
         # Schedule event queue listener
-        asyncio.ensure_future(self._listen_for_events())
+        asyncio.create_task(self._listen_for_events())
 
     def get_schedule(self):
         """
-- 
GitLab
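As background to PATCH 4/4 (not part of any diff above): on Python 3.7 and later,
asyncio.create_task() is the preferred way to schedule a coroutine. Unlike
asyncio.ensure_future(), it accepts only coroutines and must be called while an event loop is
running, so mistakes fail fast rather than being deferred. A minimal, self-contained sketch of
the difference; the tick() and main() names are illustrative only and do not appear in the
scheduler code:

    import asyncio


    async def tick():
        await asyncio.sleep(0)
        return 'done'


    async def main():
        # Python 3.7+ spelling: coroutine-only and requires a running event loop.
        task = asyncio.create_task(tick())
        print(await task)

        # Older spelling: also wraps arbitrary awaitables/futures and silently
        # falls back to get_event_loop() when no loop is running.
        fut = asyncio.ensure_future(tick())
        print(await fut)


    asyncio.run(main())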