diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000000000000000000000000000000000..737e33f684a0edb74cc6e037f34f164b416ae542 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +omit = + .tox/* + setup.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..0384309049d7979e29df8789264f71d7e58b4657 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +.git + +# Various build directories +.tox +build +*.egg-info diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..ff5d1b3626574517bba81357e741ff1556b82ae7 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,13 @@ +# Editorconfig file for cross-platform configuration of editors. +root=true + +[*.py] +max_line_length=99 + +[*.{yml,yaml}] +indent_style=space +indent_size=2 + +[*.md] +indent_style=space +indent_size=2 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000000000000000000000000000000..a984856b2260dc1e6fdd4f36333001ef8b3ebefc --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length=99 +exclude = venv,.tox diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ae493c97c759b6742de79c0c6c44c58b64df12bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,114 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +*.sqlite3 + +# PyCharm +.idea + +# Service account credentials (if instructions in README are followed) +credentials.json + +# Local configuration +gsuitesync.yaml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..8825656abcc390566716dd7d1aa901099332650e --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,17 @@ +# This file pulls in the GitLab AutoDevOps configuration via an include +# directive and then overrides bits. The rationale for this is we'd like this +# file to eventually have zero local overrides so that we can use the AutoDevOps +# pipeline as-is. + +include: + # Bring in the AutoDevOps template from GitLab. 
+ # It can be viewed at: + # https://gitlab.com/gitlab-org/gitlab-ee/blob/master/lib/gitlab/ci/templates/Auto-DevOps.gitlab-ci.yml + - template: Auto-DevOps.gitlab-ci.yml + + # Overrides to AutoDevOps for testing + - project: 'uis/devops/continuous-delivery/ci-templates' + file: '/auto-devops/tox-tests.yml' + +variables: + DOCUMENTATION_DISABLED: "1" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..01ad27fe9db7ff16354fa48166aaf1cdfd8e672d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +# This Dockerfile is intended only to support the Auto-DevOps pipeline on GitLab. +# It's not intended to package the application. + +FROM uisautomation/python:3.7-alpine + +WORKDIR /usr/src/app + +# Install specific requirements for the package along with tox to be able to run +# the tests. +ADD requirements.txt ./ +RUN pip install tox && pip install -r requirements.txt + +# Copy application source and install it. +ADD ./ ./ +RUN pip install -e ./ + +ENTRYPOINT ["gsuitesync"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..2b96072572a7e51f6222943204d5b0d9853c8643 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 University of Cambridge Information Services + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 00ee4c977962e54791913f7bf4dcfa00ae60a3ac..9c26e59f9ce3a39e275df925f83138bc6f6a8e2f 100644 --- a/README.md +++ b/README.md @@ -1 +1,95 @@ # Google GSuite Synchronisation Tool + +This repository contains a custom synchronisation tool for synchronising +information from the [Lookup service](https://www.lookup.cam.ac.uk/)'s LDAP +personality to a Google hosted domain (aka "GSuite"). + +Configuration is performed via a configuration file. Take a look at the [example +configuration file](configuration-example.yaml) for more information. + +## Usage + +The tool can be invoked from the command line: + +```console +$ gsuitesync +``` + +By default this will log what will be done. To actually perform the +synchronisation: + +```console +$ gsuitesync --really-do-this +``` + +See the output of ``gsuitesync --help`` for more information on valid +command-line flags. + +Unless overridden on the command line, the tool searches for its configuration +file in the following places in the following order: + +* A ``gsuitesync.yaml`` file in the current directory. +* ``~/.gsuitesync/configuration.yaml``. +* ``/etc/gsuitesync/configuration.yaml``. + +The first located file is used. 
+ +## Installation + +The command-line tool can be installed directly from the git repository: + +```console +$ pip3 install git+https://gitlab.developers.cam.ac.uk/uis/gsuite/synctool.git +``` + +For developers, the script can be installed from a cloned repo using ``pip``: + +```console +$ cd /path/to/this/repo +$ pip3 install -e . +``` + +## New users + +When new users are created they are created with a random password which is +immediately thrown away. They are created with a primary email of the form +``[uid]@[domain]`` where ``[uid]`` is the unique id from lookup (i.e. the CRSid) +and ``[domain]`` is the name of the Google domain from the configuration. + +## Required API scopes + +This tool requires the following OAuth2 scopes to audition the changes to be +made: + +* ``https://www.googleapis.com/auth/admin.directory.user.readonly`` + +This tool requires the following OAuth2 scopes to actually perform changes: + +* ``https://www.googleapis.com/auth/admin.directory.user`` + +See the section on preparing a service account for information on how to grant a +service account those scopes on your domain. + +## Preparing a service account + +This tool assumes it will be acting as a service account user. It will use this +service account user to then act on behalf of an admin user in GSuite. To +prepare such a service account user: + +1. Create a service account in the Google Console for this script. +2. Generate and download JSON credentials for the service account. +3. Under "IAM" > "Service Accounts", select the service account, click "Edit", + click "Show domain-wide delegation" and "Enable G Suite Domain-wide + Delegation". Click "Save" to apply the changes. +4. Hover over the "?" symbol next to the generated client id and click "view + client". Copy the Client ID from the popup panel. +5. In the GSuite admin panel, go to "Security Settings" > "Advanced Settings" > + "Manage API client access". +6. 
Paste in the service account Client ID as "Client Name" and add a + comma-separated list of scopes. See the section on required API scopes. + +The scary-sounding "Enable G Suite Domain-wide Delegation" means that this +service account is marked as being willing to "su" to another Google user. By +adding the generated Client ID to the GSuite security settings you are, as +domain administrator, giving that service account the ability to act as any user +in the domain **subject to the listed scopes**. diff --git a/configuration-example.yaml b/configuration-example.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d0e106c38cc4f83e8b89ab39e5411dd09d3f0f8 --- /dev/null +++ b/configuration-example.yaml @@ -0,0 +1,90 @@ +# Example of Google directory sync configuration. + +# Synchronisation configuration +sync: + # A regular expression which is used to match the organization unit path for + # Google users who should be excluded from the list returned by Google. Those + # users do not exist for the purposes of the rest of the sync and so if they + # appear in the list of managed users this script will attempt to re-add them + # and fail in the process. Use this setting for users who are managed + # completely outside of this script. + ignore_google_org_unit_path_regex: '^/Service Accounts$' + +# Configure limits defining maximum scope of changes. +limits: + # The abort_... settings below are safety limits and will abort the run if the + # limits are violated. They are there to define the "sane limits" for an + # update. + + # Refuse to perform sync if we are to "touch" more than this percentage of + # users. The percentage of users "touched" is calculated as + # + # (new google users + modified google users) / max(1, total google users) + # + # where "modified" includes metadata changes and suspension/restoration. As + # such this calculated percentage can be greater than 100. Set to null to + # have no limit. Default: null. 
+ abort_user_change_percentage: 2 # percent + + # The max_... settings below will not abort the run if the number of users + # affected is greater than the specified number. Instead the number of users + # affected is capped to that number. The selection of which users are included + # in the capped number is arbitrary. + + # Limit the number of new user creations per run. This is an absolute number. + # Set to null to have no limit. Default: null. + max_new_users: 100 + + # Limit the number of user suspensions per run. This is an absolute number. + # Set to null to have no limit. Default: null. + max_suspended_users: 100 + + # Limit the number of user un-suspensions (reactivations) per run. This is an + # absolute number. Set to null to have no limit. Default: null. + max_reactivated_users: 100 + + # Limit the number of user metadata changes per run. This is an absolute + # number. Set to null to have no limit. Default: null + max_updated_users: 100 + +# Google API configuration +google_api: + # Authentication + auth: + # Path to on-disk JSON credentials used when accessing the API. + credentials: "./credentials.json" + + # Path to on-disk JSON credentials used when accessing the API in + # "read-only" mode. Use this if you want to have a separate "safe" service + # account which can only read data. If null, use the same credentials for + # reading and writing. Default: null. + read_only_credentials: null + +# Details about the LDAP server +ldap: + # Scheme and hostname of the LDAP server. + host: 'ldaps://ldap.example.com' + + # LDAP search base. Filters are always relative to this. + search_base: 'ou=people,o=example-corps,dc=example,dc=com' + + # Filter to use to determine the "eligible" list of users. If a non-admin user + # is found on Google who isn't in this list, their account will be suspended. + eligible_user_filter: '(uid=*)' + + # Filter to use to determine the "managed" list of users. 
If a user appears in + # this list who isn't in Google their account is created. If the user metadata + # for a user in this list changes, the change is propagated to Google. If + # null, the value of "eligible_user_filter" is used. Default: null. + managed_user_filter: null + +# Details about the Google Domain we're managing. +google_domain: + # Name of the domain. + name: 'example.com' + + # Username within the GSuite for the user which has administration rights. + # Should be an e-mail style name. E.g. "super-admin@example.com". The service + # account credentials specified in the google_api.auth section are used to + # perform admin actions as this user. + admin_user: 'super-admin@example.com' diff --git a/gsuitesync/__init__.py b/gsuitesync/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3677d732b116622e1b99a3f41b305cce5be649a1 --- /dev/null +++ b/gsuitesync/__init__.py @@ -0,0 +1,45 @@ +""" +Synchronise users to GSuite + +Usage: + gsuitesync (-h | --help) + gsuitesync [--configuration=FILE] [--quiet] [--really-do-this] + +Options: + -h, --help Show a brief usage summary. + + --quiet Reduce logging verbosity. + + --configuration=FILE Specify configuration file to load. + + --really-do-this Actually try to make the changes. + +""" +import logging +import os +import sys + +import docopt + +from . import config +from . 
import sync + + +LOG = logging.getLogger(os.path.basename(sys.argv[0])) + + +def main(): + opts = docopt.docopt(__doc__) + + # Configure logging + logging.basicConfig(level=logging.WARN if opts['--quiet'] else logging.INFO) + + # HACK: make the googleapiclient.discovery module less spammy in the logs + logging.getLogger('googleapiclient.discovery').setLevel(logging.WARN) + logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR) + + LOG.info('Loading configuration') + configuration = config.load_configuration(opts['--configuration']) + + # Perform sync + sync.sync(configuration, read_only=not opts['--really-do-this']) diff --git a/gsuitesync/__main__.py b/gsuitesync/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c70d0be2f94f22b62f5b9fea21536367fe62e8 --- /dev/null +++ b/gsuitesync/__main__.py @@ -0,0 +1,4 @@ +from . import main + +if __name__ == '__main__': + main() diff --git a/gsuitesync/config.py b/gsuitesync/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4f51965719b94eccfcc40a527e21cdf087f4885f --- /dev/null +++ b/gsuitesync/config.py @@ -0,0 +1,90 @@ +""" +Utilities for parsing configuration files. + +""" +import dataclasses +import logging +import os + +import yaml + +LOG = logging.getLogger(__name__) + + +class ConfigurationError(RuntimeError): + """ + Base class for all configuration errors. + + """ + + +class ConfigurationNotFound(ConfigurationError): + """ + A suitable configuration could not be located. + + """ + def __init__(self): + return super().__init__('Could not find any configuration file') + + +def load_configuration(location=None): + """ + Load configuration and return a :py:class:`Configuration` instance. Pass a non-None location to + override the default search path. + + :raises: ConfigurationError if the configuration could not be loaded. 
+ + """ + if location is not None: + paths = [location] + else: + if 'GSUITESYNC_CONFIGURATION' in os.environ: + paths = [os.environ['GSUITESYNC_CONFIGURATION']] + else: + paths = [] + paths.extend([ + os.path.join(os.getcwd(), 'gsuitesync.yaml'), + os.path.expanduser('~/.gsuitesync/configuration.yaml'), + '/etc/gsuitesync/configuration.yaml' + ]) + + valid_paths = [path for path in paths if os.path.isfile(path)] + + if len(valid_paths) == 0: + LOG.error('Could not find configuration file. Tried:') + for path in paths: + LOG.error('"%s"', path) + raise ConfigurationNotFound() + + with open(valid_paths[0]) as f: + return yaml.safe_load(f) + + +class ConfigurationDataclassMixin: + """ + Mixin class for dataclass which adds a "from_dict" member which will construct an instance from + a dictionary. Fields which have no default value become required fields. + + """ + + @classmethod + def from_dict(cls, dict_): + """ + Construct an instance from a dict. + + """ + field_names = {field.name for field in dataclasses.fields(cls)} + required_field_names = { + field.name for field in dataclasses.fields(cls) + if field.default is dataclasses.MISSING + } + + for key in dict_.keys(): + if key not in field_names: + raise ValueError(f'Unknown configuration key: {key}') + + for key in required_field_names: + if key not in dict_: + raise ValueError(f'{key}: required field not set') + + return cls(**dict_) diff --git a/gsuitesync/gapiauth.py b/gsuitesync/gapiauth.py new file mode 100644 index 0000000000000000000000000000000000000000..436e7696a9562658888d450f37b2ba1dd3bd74ba --- /dev/null +++ b/gsuitesync/gapiauth.py @@ -0,0 +1,43 @@ +""" +Google API authentication. 
+ +""" +import dataclasses +import logging +import typing + +from google.oauth2 import service_account + +from .config import ConfigurationDataclassMixin + + +LOG = logging.getLogger(__name__) + + +@dataclasses.dataclass +class Configuration(ConfigurationDataclassMixin): + """ + Configuration of Google API access credentials. + + """ + # Path to on-disk JSON credentials used when accessing the API. + credentials: str + + # Path to on-disk JSON credentials used when accessing the API in "read-only" mode. Use this if + # you want to have a separate "safe" service account which can only read data. If null, use the + # same credentials for reading and writing. + read_only_credentials: typing.Union[str, None] = None + + def load_credentials(self, *, read_only=True): + """ + Create a Google credentials object from the configuration. Use *read_only* to indicate if + read-only credentials are preferred. + + """ + credentials = self.credentials + if read_only and self.read_only_credentials is not None: + credentials = self.read_only_credentials + LOG.info('Using read-only credentials.') + + LOG.info('Loading Google account credentials from "%s"', credentials) + return service_account.Credentials.from_service_account_file(credentials) diff --git a/gsuitesync/gapidomain.py b/gsuitesync/gapidomain.py new file mode 100644 index 0000000000000000000000000000000000000000..467f2684e572e4fc13202918700cff03583a41cc --- /dev/null +++ b/gsuitesync/gapidomain.py @@ -0,0 +1,22 @@ +""" +Google Domain management. + +""" +import dataclasses + +from .config import ConfigurationDataclassMixin + + +@dataclasses.dataclass +class Configuration(ConfigurationDataclassMixin): + """ + Configuration for accessing the Google Domain. + + """ + # Name of the domain. (E.g. "example.com".) + name: str + + # Username within the GSuite for the user which has administration rights. Should be an e-mail + # style name. E.g. "super-admin@example.com". 
The service account credentials specified in the + # google_api.auth section are used to perform admin actions as this user. + admin_user: str diff --git a/gsuitesync/gapiutil.py b/gsuitesync/gapiutil.py new file mode 100644 index 0000000000000000000000000000000000000000..05a2a46c5a7878c09361c1d9fcce85d39dd6f88e --- /dev/null +++ b/gsuitesync/gapiutil.py @@ -0,0 +1,27 @@ +""" +Utility functions which should have been part of the Google API client. + +""" + + +def list_all(list_cb, *, page_size=500, items_key='items', **kwargs): + """ + Simple wrapper for Google Client SDK list()-style callables. Repeatedly fetches pages of + results merging all the responses together. Returns the merged "items" arrays from the + responses. The key used to get the "items" array from the response may be overridden via the + items_key argument. + + """ + # Loop while we wait for nextPageToken to be "none" + page_token = None + resources = [] + while True: + list_response = list_cb(pageToken=page_token, maxResults=page_size, **kwargs).execute() + resources.extend(list_response.get(items_key, [])) + + # Get the token for the next page + page_token = list_response.get('nextPageToken') + if page_token is None: + break + + return resources diff --git a/gsuitesync/ldap.py b/gsuitesync/ldap.py new file mode 100644 index 0000000000000000000000000000000000000000..2c11b245d32412898c728e9730235ddf052b6b8b --- /dev/null +++ b/gsuitesync/ldap.py @@ -0,0 +1,75 @@ +""" +Retrieving user information from an LDAP directory. + +""" +import collections +import dataclasses +import typing + +import ldap3 + +from .config import ConfigurationDataclassMixin + + +# User information we need to populate the Google user directory. +UserEntry = collections.namedtuple('UserEntry', 'uid cn sn displayName') + + +@dataclasses.dataclass +class Configuration(ConfigurationDataclassMixin): + """ + Configuration for accessing the LDAP directory. 
+ + """ + host: str + + search_base: str + + eligible_user_filter: str + + managed_user_filter: typing.Union[str, None] = None + + def get_eligible_uids(self): + """ + Return a set containing all uids who are eligible to have a Google account. + + """ + return { + e['attributes']['uid'][0] + for e in self._search(search_filter=self.eligible_user_filter, attributes=['uid']) + } + + def get_managed_user_entries(self): + """ + Return a list containing all managed user entries as UserEntry instances. + + """ + search_filter = ( + self.managed_user_filter + if self.managed_user_filter is not None + else self.eligible_user_filter + ) + return [ + UserEntry( + uid=_extract(e, 'uid'), cn=_extract(e, 'cn'), sn=_extract(e, 'sn'), + displayName=_extract(e, 'displayName') + ) + for e in self._search( + search_filter=search_filter, attributes=['uid', 'cn', 'sn', 'displayName'] + ) + ] + + def _search(self, *, search_filter, attributes): + ldap_server = ldap3.Server(self.host) + with ldap3.Connection(ldap_server, auto_bind=True) as conn: + return conn.extend.standard.paged_search( + self.search_base, search_filter, paged_size=1000, attributes=attributes) + + +def _extract(entry, attr, *, default=''): + vs = entry['attributes'].get(attr, []) + if len(vs) == 0: + return default + if isinstance(vs, str): + return vs + return vs[0] diff --git a/gsuitesync/limits.py b/gsuitesync/limits.py new file mode 100644 index 0000000000000000000000000000000000000000..602451ebf17a6e216e9885cbaee1e9a9e8503f4e --- /dev/null +++ b/gsuitesync/limits.py @@ -0,0 +1,48 @@ +""" +Synchronisation limits. + +""" +import dataclasses +import numbers +import typing + +from . import config + + +@dataclasses.dataclass +class Configuration(config.ConfigurationDataclassMixin): + """ + Configuration for synchronisation limits. + + """ + # The abort_... settings below are safety limits and will abort the run if the limits are + # violated. They are there to define the "sane limits" for an update. 
+ + # Refuse to perform sync if we are to "touch" more than this percentage of users. The + # percentage of users "touched" is calculated as + # + # (new google users + modified google users) / max(1, total google users) + # + # where "modified" includes metadata changes and suspension/restoration. As such this + # calculated percentage can be greater than 100. Set to null to have no limit. Default: null. + abort_user_change_percentage: typing.Union[None, numbers.Real] = None + + # The max_... settings below will not abort the run if the number of users affected is greater + # than the specified number. Instead the number of users affected is capped to that number. The + # selection of which users are included in the capped number is arbitrary. + + # Limit the number of new user creations per run. This is an absolute number. Set to None to + # have no limit. + max_new_users: typing.Union[None, numbers.Real] = None + + # Limit the number of user suspensions per run. This is an absolute number. Set to None to + # have no limit. + max_suspended_users: typing.Union[None, numbers.Real] = None + + # Limit the number of user un-suspensions (reactivations) per run. This is an absolute number. + # Set to None to have no limit. + max_reactivated_users: typing.Union[None, numbers.Real] = None + + # Limit the number of user metadata changes per run. This is an absolute number. Set to None to + # have no limit. + max_updated_users: typing.Union[None, numbers.Real] = None diff --git a/gsuitesync/naming.py b/gsuitesync/naming.py new file mode 100644 index 0000000000000000000000000000000000000000..a658610de1373c9b44d86f740d3f45830500a12f --- /dev/null +++ b/gsuitesync/naming.py @@ -0,0 +1,173 @@ +""" +Utilities for constructing human-friendly names. + +""" +import collections + + +# The human-friendly names constructed by get_names(). 
+Names = collections.namedtuple('Names', 'given_name family_name') + + +def get_names(*, uid, display_name=None, cn=None, sn=None): + """ + If we only have a uid, this is used for both given name and family name. + + >>> get_names(uid='spqr1') + Names(given_name='spqr1', family_name='spqr1') + >>> get_names(uid='spqr1', display_name='spqr1') + Names(given_name='spqr1', family_name='spqr1') + >>> get_names(uid='spqr1', display_name='spqr1', cn='spqr1', sn='spqr1') + Names(given_name='spqr1', family_name='spqr1') + >>> get_names(uid='spqr1', display_name='') + Names(given_name='spqr1', family_name='spqr1') + >>> get_names(uid='spqr1', sn='') + Names(given_name='spqr1', family_name='spqr1') + >>> get_names(uid='spqr1', cn='') + Names(given_name='spqr1', family_name='spqr1') + + "Odd" ASCII characters unsupported by Google are stripped out of names. + + >>> get_names(uid='spqr1', display_name='Stephen @**Quill-Roman**@') + Names(given_name='Stephen', family_name='Quill-Roman') + + Long names are truncated. + + >>> get_names(uid='spqr1', display_name='Stephen Quill-Roman' + 'X' * 200) + ... #doctest: +NORMALIZE_WHITESPACE + Names(given_name='Stephen', + family_name='Quill-RomanXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX') + + For compatibility with the existing authenticator, if we have common name and surname and the + common name ends with the surname, this is used to form the given names assuming there is some + string left. (Once we're happy with the sync, we should remove this.) + + >>> get_names(uid='spqr1', sn='Quill Roman', cn='Prof. S.P. Quill Roman') + Names(given_name='Prof. S.P.', family_name='Quill Roman') + + For similar compatibility reasons, if we have a cn, split it at the final space character + irrespective of display name. + + >>> get_names(uid='spqr1', cn='Prof. S.P. Quill Roman', display_name='Foo Bar') + Names(given_name='Prof. S.P. Quill', family_name='Roman') + >>> get_names( + ... uid='spqr1', sn='spqr1', display_name='Stephen P. Q. 
 Roman', + ... cn='Prof. S.P.Q. Roman') + Names(given_name='Prof. S.P.Q.', family_name='Roman') + + (In the future we will probably remove the compatibility layer but not until we're happy with + the rest of the sync.) + + If we have display name and surname and the display name ends with the surname, this is used to + form the given names assuming there is some string left. + + >>> get_names(uid='spqr1', display_name='Stephen Quill Roman', sn='Quill Roman') + Names(given_name='Stephen', family_name='Quill Roman') + + If this didn't work but we have display name, split it at the final space. + + >>> get_names(uid='spqr1', display_name='Stephen Quill Roman', sn='Stephen Quill Roman') + Names(given_name='Stephen Quill', family_name='Roman') + >>> get_names(uid='spqr1', display_name='Stephen Quill Roman', cn='Stephen') + Names(given_name='Stephen Quill', family_name='Roman') + >>> get_names(uid='spqr1', display_name='Stephen Quill Roman') + Names(given_name='Stephen Quill', family_name='Roman') + + Support Wookey. + + >>> get_names(uid='spqr1', display_name='Wookey') + Names(given_name='Wookey', family_name='spqr1') + >>> get_names(uid='spqr1', sn='Wookey') + Names(given_name='spqr1', family_name='Wookey') + >>> get_names(uid='spqr1', cn='Wookey') + Names(given_name='Wookey', family_name='spqr1') + + """ + # If any of display name, common name or surname is the same as the uid, proceed as if it were + # unset. Trim any leading/trailing whitespace at the same time. + cn = cn.strip() if cn is not None and cn != uid else None + sn = sn.strip() if sn is not None and sn != uid else None + display_name = ( + display_name.strip() + if display_name is not None and display_name != uid else None + ) + + # If any of cn, sn or display_name are blank, proceed as if they're not set. + cn = cn if cn != '' else None + sn = sn if sn != '' else None + display_name = display_name if display_name != '' else None + + # Function to construct return value from family name and given name. 
Google names can't be + # longer than 60 characters so truncate them after cleaning. + def _make_ret(*, family_name, given_name): + return Names(family_name=_clean(family_name)[:60], given_name=_clean(given_name)[:40]) + + # If we have a sn and cn and the cn ends with sn, split out the sn. + if cn is not None and sn is not None and cn.endswith(sn): + given_name = cn[:-len(sn)].strip() + if given_name != '': + return _make_ret(family_name=sn, given_name=given_name) + + # If we have cn, split at space and see if we have two parts. + if cn is not None: + components = cn.split() + if len(components) > 0: + family_name = components[-1] + given_name = ' '.join(components[:-1]) + if given_name != '' and family_name != '': + return _make_ret(family_name=family_name, given_name=given_name) + + # If we have a sn and display name and the display name ends with sn, split out the sn. + if display_name is not None and sn is not None and display_name.endswith(sn): + given_name = display_name[:-len(sn)].strip() + if given_name != '': + return _make_ret(family_name=sn, given_name=given_name) + + # If we have the display name, split at space and see if we have two parts. + if display_name is not None: + components = display_name.split() + if len(components) > 0: + family_name = components[-1] + given_name = ' '.join(components[:-1]) + if given_name != '' and family_name != '': + return _make_ret(family_name=family_name, given_name=given_name) + + # Support Wookey. + if display_name is not None and ' ' not in display_name: + return _make_ret(family_name=uid, given_name=display_name) + if sn is not None and ' ' not in sn: + return _make_ret(family_name=sn, given_name=uid) + if cn is not None and ' ' not in cn: + return _make_ret(family_name=uid, given_name=cn) + + # Give up and return uid for both fields + return _make_ret(family_name=uid, given_name=uid) + + +def _clean(s): + """ + Clean any "bad characters" in names. 
This pattern is based on the one used by the + legacy Google authenticator which has this comment: + + Google API doesn't like _some_ characters. The 'documentation' + (http://www.google.com/support/a/bin/answer.py?answer=33386) says "First and last names + support unicode/UTF-8 characters, and may contain spaces, letters (a-z), numbers (0-9), + dashes (-), forward slashes (/), and periods (.)", which makes no sence [sic]. + Experimentation suggests it chokes on '<', '>', and '=', but doesn't mind, e.g. cyrilic + characters. Compromise by filtering out "!"#$%&'()*+,:;<=>?@[\\]^_`{|}~" - i.e. all the + 'odd' ASCII characters other than the ones explicitly supported. + + We change this to allow "'" since plenty of names have this character. (E.g. "O'Reilly", + "D'Angelo", etc.) + + >>> _clean('ab@c') + 'abc' + >>> _clean('a "b" c') + 'a b c' + + """ + return ''.join(c for c in s if c not in _CLEAN_BAD_CHARS) + + +# Characters stripped by _clean. Present as a constant to avoid re-creating it. +_CLEAN_BAD_CHARS = '!"#$%&()*+,:;<=>?@[\\]^_`{|}~' diff --git a/gsuitesync/sync.py b/gsuitesync/sync.py new file mode 100644 index 0000000000000000000000000000000000000000..be993173e94e4964cf28598e07b74572a4c13dc1 --- /dev/null +++ b/gsuitesync/sync.py @@ -0,0 +1,358 @@ +""" +Synchronise Google Directory with a local LDAP directory. + +""" +import crypt +import dataclasses +import itertools +import logging +import re +import secrets +import typing + +from googleapiclient import discovery + +from . import config +from . import gapiauth +from . import gapidomain +from . import gapiutil +from . import ldap +from . import limits +from . import naming + +LOG = logging.getLogger(__name__) + +# Scopes required to perform read-only actions. +READ_ONLY_SCOPES = [ + 'https://www.googleapis.com/auth/admin.directory.user.readonly', +] + +# Scoped *in addition to READ_ONLY_SCOPES* required to perform a full update. 
# Scopes required *in addition to READ_ONLY_SCOPES* to perform a full update.
WRITE_SCOPES = [
    'https://www.googleapis.com/auth/admin.directory.user',
]


@dataclasses.dataclass
class Configuration(config.ConfigurationDataclassMixin):
    # A regular expression which is used to match the organization unit path for Google users who
    # should be excluded from the list returned by Google. Those users do not exist for the
    # purposes of the rest of the sync and so if they appear in the list of managed users this
    # script will attempt to re-add them and fail in the process. Use this setting for users who
    # are managed completely outside of this script.
    ignore_google_org_unit_path_regex: typing.Union[str, None] = None


def sync(configuration, *, read_only=True):
    """
    Perform sync given configuration dictionary.

    *configuration* is a mapping with optional "sync", "google_api", "google_domain", "ldap"
    and "limits" sections, each parsed by the corresponding Configuration dataclass.

    When *read_only* is True (the default) all changes are calculated and logged but no batch
    requests are issued to Google.

    Raises RuntimeError if an internal sanity check fails or if the proportion of users that
    would be modified exceeds the configured abort limit.
    """
    if read_only:
        LOG.info('Performing synchronisation in READ ONLY mode.')
    else:
        LOG.info('Performing synchronisation in WRITE mode.')

    # Parse configuration
    sync_config = Configuration.from_dict(configuration.get('sync', {}))
    gapi_auth_config = gapiauth.Configuration.from_dict(
        configuration.get('google_api', {}).get('auth', {}))
    gapi_domain_config = gapidomain.Configuration.from_dict(
        configuration.get('google_domain', {}))
    ldap_config = ldap.Configuration.from_dict(configuration.get('ldap', {}))
    limits_config = limits.Configuration.from_dict(configuration.get('limits', {}))

    # Load appropriate Google credentials. Write scopes are only requested when we will
    # actually modify users; the credentials act on behalf of the domain admin user.
    creds = (
        gapi_auth_config.load_credentials(read_only=read_only)
        .with_scopes(READ_ONLY_SCOPES + ([] if read_only else WRITE_SCOPES))
        .with_subject(gapi_domain_config.admin_user)
    )

    # Get a set containing all CRSids. These are all the people who are eligible to be in our
    # GSuite instance. If a user is in GSuite and is *not* present in this list then they are
    # suspended.
    LOG.info('Reading eligible user entries from LDAP')
    eligible_uids = ldap_config.get_eligible_uids()
    LOG.info('Total LDAP entries: %s', len(eligible_uids))

    # Get a list of managed users. These are all the people who match the "managed_user_filter"
    # in the LDAP settings.
    LOG.info('Reading managed user entries from LDAP')
    managed_user_entries = ldap_config.get_managed_user_entries()

    # Form a mapping from uid to managed user.
    managed_user_entries_by_uid = {u.uid: u for u in managed_user_entries}

    # Form a set of all *managed user* uids
    managed_user_uids = set(managed_user_entries_by_uid.keys())
    LOG.info('Total managed user entries: %s', len(managed_user_uids))

    # Sanity check: the managed users should be a subset of the eligible ones.
    if len(managed_user_uids - eligible_uids) != 0:
        raise RuntimeError('Sanity check failed: some managed uids were not in the eligible set')

    # Build the directory service using Google API discovery.
    directory_service = discovery.build('admin', 'directory_v1', credentials=creds)

    # Retrieve information on all users excluding domain admins. Only the fields listed below
    # are requested to keep the response size down.
    LOG.info('Getting information on Google domain users')
    fields = [
        'id', 'isAdmin', 'orgUnitPath', 'primaryEmail', 'suspended', 'suspensionReason',
        'name(givenName, familyName)',
    ]
    all_google_users = gapiutil.list_all(
        directory_service.users().list, items_key='users', domain=gapi_domain_config.name,
        query='isAdmin=false', fields='nextPageToken,users(' + ','.join(fields) + ')',
    )

    # Strip any "to be ignored" users out of the results.
    if sync_config.ignore_google_org_unit_path_regex is not None:
        LOG.info(
            'Ignoring users whose organization unit path matches %r',
            sync_config.ignore_google_org_unit_path_regex)
        regex = re.compile(sync_config.ignore_google_org_unit_path_regex)
        all_google_users = [
            u for u in all_google_users if not regex.match(u['orgUnitPath'])
        ]

    # Sanity check. There should be no admins in the returned results.
    if any(u.get('isAdmin', False) for u in all_google_users):
        raise RuntimeError('Sanity check failed: admin users in user list')

    # Form a mapping from uid to Google user. We form the uid by splitting out the local-part of
    # the email address.
    all_google_users_by_uid = {u['primaryEmail'].split('@')[0]: u for u in all_google_users}

    # Form a set of all Google-side uids. The all_google_uids set is all users including the
    # suspended ones and the suspended_google_uids set is only the suspended users. Non suspended
    # users are therefore all_google_uids - suspended_google_uids.
    all_google_uids = set(all_google_users_by_uid.keys())
    suspended_google_uids = {uid for uid, u in all_google_users_by_uid.items() if u['suspended']}

    # Sanity check. We should not have lost anyone. (I.e. the uid should be unique.)
    if len(all_google_uids) != len(all_google_users):
        raise RuntimeError('Sanity check failed: user list changed length')

    # Log some stats.
    LOG.info('Total Google users: %s', len(all_google_uids))
    LOG.info(
        'Suspended Google users: %s', sum(1 if u['suspended'] else 0 for u in all_google_users))

    # For each user which exists in Google or the managed user set which is eligible, determine if
    # they need updating/creating. If so, record a patch/insert for the user.
    LOG.info('Calculating updates...')
    google_user_updates = {}
    google_user_creations = {}
    for idx, (uid, managed_user_entry) in enumerate(managed_user_entries_by_uid.items()):
        # Show progress
        if (idx + 1) % 5000 == 0:
            LOG.info('Processed %s/%s...', idx+1, len(managed_user_entries_by_uid))

        # Heuristically determine the given and family names.
        names = naming.get_names(
            uid=uid, display_name=managed_user_entry.displayName, cn=managed_user_entry.cn,
            sn=managed_user_entry.sn)

        # Form expected user resource fields.
        expected_google_user = {
            'name': {
                'givenName': names.given_name,
                'familyName': names.family_name,
            },
        }

        # Find existing Google user (if any).
        existing_google_user = all_google_users_by_uid.get(uid)

        if existing_google_user is not None:
            # See if we need to change the existing user
            # Unless anything needs changing, the patch is empty.
            patch = {}

            # Determine how to patch user's name.
            google_user_name = existing_google_user.get('name', {})
            patch_name = {}
            if google_user_name.get('givenName') != expected_google_user['name']['givenName']:
                patch_name['givenName'] = names.given_name
            if google_user_name.get('familyName') != expected_google_user['name']['familyName']:
                patch_name['familyName'] = names.family_name
            if len(patch_name) > 0:
                patch['name'] = patch_name

            # Only record non-empty patches.
            if len(patch) > 0:
                google_user_updates[uid] = patch
        else:
            # No existing Google user. Record the new resource. (A random throw-away password
            # hash is attached to this resource later, in api_requests(), immediately before the
            # insert request is built. It doesn't matter what this password is since we never
            # have the user log in with it; for password-only applications the user can make use
            # of an application-specific password.)
            new_user = {
                **{
                    'primaryEmail': f'{uid}@{gapi_domain_config.name}',
                },
                **expected_google_user,
            }
            google_user_creations[uid] = new_user

    # Form a set of all the uids which need patching.
    uids_to_update = set(google_user_updates.keys())
    LOG.info('Number of existing users to update: %s', len(uids_to_update))

    # Form a set of all the uids which need adding.
    uids_to_add = set(google_user_creations.keys())
    LOG.info('Number of users to add: %s', len(uids_to_add))

    # Form a set of all uids which need reactivating. We reactivate users who are in the managed
    # user list *and* the suspended user list.
    uids_to_reactivate = suspended_google_uids & managed_user_uids
    LOG.info('Number of users to reactivate: %s', len(uids_to_reactivate))

    # Form a set of all uids which should be suspended. This is all the unsuspended Google uids
    # which do not appear in our eligible user list.
    uids_to_suspend = (all_google_uids - suspended_google_uids) - eligible_uids
    LOG.info('Number of users to suspend: %s', len(uids_to_suspend))

    # Calculate percentage change. (max(1, ...) guards against division by zero on an empty
    # domain.)
    user_change_percentage = 100. * (
        len(uids_to_add | uids_to_update | uids_to_reactivate | uids_to_suspend)
        /
        max(1, len(all_google_uids))
    )
    LOG.info('Configuration will modify %.2f%% of users', user_change_percentage)

    # Enforce percentage change sanity check. This protects against, e.g., a misconfigured LDAP
    # filter suddenly suspending the whole domain.
    if (limits_config.abort_user_change_percentage is not None and
            user_change_percentage > limits_config.abort_user_change_percentage):
        LOG.error(
            'Modification of %.2f%% of users is greater than limit of %.2f%%. Aborting.',
            user_change_percentage, limits_config.abort_user_change_percentage
        )
        raise RuntimeError('Aborting due to large user change percentage')

    # Cap maximum size of various operations. _limit() keeps an unspecified subset, so which
    # users are processed first is arbitrary; the remainder are picked up on a later run.
    if limits_config.max_new_users is not None and len(uids_to_add) > limits_config.max_new_users:
        uids_to_add = _limit(uids_to_add, limits_config.max_new_users)
        LOG.info('Capped number of new users to %s', len(uids_to_add))
    if (limits_config.max_suspended_users is not None and
            len(uids_to_suspend) > limits_config.max_suspended_users):
        uids_to_suspend = _limit(uids_to_suspend, limits_config.max_suspended_users)
        LOG.info('Capped number of users to suspend to %s', len(uids_to_suspend))
    if (limits_config.max_reactivated_users is not None and
            len(uids_to_reactivate) > limits_config.max_reactivated_users):
        uids_to_reactivate = _limit(uids_to_reactivate, limits_config.max_reactivated_users)
        LOG.info('Capped number of users to reactivate to %s', len(uids_to_reactivate))
    if (limits_config.max_updated_users is not None and
            len(uids_to_update) > limits_config.max_updated_users):
        uids_to_update = _limit(uids_to_update, limits_config.max_updated_users)
        LOG.info('Capped number of users to update to %s', len(uids_to_update))

    # A generator which will generate patch() and insert() calls to the directory service to
    # perform the actions required. Requests are built lazily so that they can be batched below
    # without materialising them all at once.
    def api_requests():
        # Update existing users.
        user_updates = {uid: google_user_updates[uid] for uid in uids_to_update}
        for uid, update in user_updates.items():
            google_id = all_google_users_by_uid[uid]['id']
            LOG.info('Update user "%s": "%r"', uid, update)
            yield directory_service.users().patch(userKey=google_id, body=update)

        # Suspend old users
        for uid in uids_to_suspend:
            google_id = all_google_users_by_uid[uid]['id']
            LOG.info('Suspending user: "%s"', uid)
            yield directory_service.users().patch(userKey=google_id, body={'suspended': True})

        # Reactivate returning users
        for uid in uids_to_reactivate:
            google_id = all_google_users_by_uid[uid]['id']
            LOG.info('Reactivating user: "%s"', uid)
            yield directory_service.users().patch(userKey=google_id, body={'suspended': False})

        # Create new users
        for uid in uids_to_add:
            # Generate a random password which is thrown away. Only the SHA-512 crypt hash is
            # sent to Google; the plaintext is never stored or logged.
            new_user = {**{
                'hashFunction': 'crypt',
                'password': crypt.crypt(secrets.token_urlsafe(), crypt.METHOD_SHA512),
            }, **google_user_creations[uid]}
            redacted_user = {**new_user, **{'password': 'REDACTED'}}
            LOG.info('Adding user "%s": %s', uid, redacted_user)
            yield directory_service.users().insert(body=new_user)

    # Make a chunked iterator of requests to the directory API. The Directory API supports a
    # maximum batch size of 1000. See:
    # https://developers.google.com/admin-sdk/directory/v1/guides/batch
    for request_batch in _grouper(api_requests(), n=1000):
        # Form batch request.
        batch = directory_service.new_batch_http_request()
        for request in request_batch:
            batch.add(request, callback=_handle_batch_response)

        # Execute the batch request if not in read only mode. Otherwise log that we would have.
        if not read_only:
            LOG.info('Issuing batch request to Google.')
            batch.execute()
        else:
            LOG.info('Not issuing batch request in read-only mode.')


def _handle_batch_response(request_id, response, exception):
    """
    Callback for each request in a directory API batch. Logs failures; successful responses
    are ignored since the requests are fire-and-forget.
    """
    if exception is not None:
        LOG.error('Error performing request: %s', exception)
        LOG.error('Response: %r', response)


def _limit(s, limit):
    """
    Given a set, s, and a numeric limit, return a set which has no more than *limit* elements. The
    exact set of elements retained is not specified.

    >>> s = set('ABCDEFGHIJKLMNOPQ')
    >>> len(s) > 5
    True
    >>> len(_limit(s, 5)) == 5
    True
    >>> len(_limit(s, 500)) == len(s)
    True

    All elements of the returned set are taken from input set.

    >>> s_prime = _limit(s, 5)
    >>> s_prime - s
    set()

    """
    # takewhile stops as soon as the enumerate index reaches *limit*, so at most *limit*
    # elements are consumed from the (arbitrary) set iteration order.
    return {e for _, e in itertools.takewhile(lambda p: p[0] < limit, enumerate(s))}


def _grouper(iterable, *, n):
    """
    Group an iterable into chunks of at most *n* elements. A generator which yields iterables
    representing slices of *iterable*.

    >>> [list(i) for i in _grouper('ABCDEFGH', n=3)]
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G', 'H']]
    >>> def generator(stop):
    ...     for x in range(stop):
    ...         yield x
    >>> [list(i) for i in _grouper(generator(10), n=3)]
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    >>> [list(i) for i in _grouper(generator(12), n=3)]
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]

    The implementation of this function attempts to be efficient; the chunks are iterables which
    are generated on demand rather than being constructed first. Hence this function can deal with
    iterables which would fill memory if intermediate chunks were stored.

    >>> i = _grouper(generator(100000000000000000000), n=1000000000000000)
    >>> next(next(i))
    0

    """
    it = iter(iterable)
    while True:
        # islice yields at most n items; pulling the first item eagerly lets us detect
        # exhaustion of the underlying iterator and stop cleanly.
        next_chunk_it = itertools.islice(it, n)
        try:
            first = next(next_chunk_it)
        except StopIteration:
            return
        yield itertools.chain((first,), next_chunk_it)
def load_requirements():
    """
    Load requirements file and return non-empty, non-comment lines with leading and trailing
    whitespace stripped.
    """
    requirements_path = os.path.join(os.path.dirname(__file__), 'requirements.txt')
    # Open with an explicit encoding so parsing does not depend on the locale default.
    with open(requirements_path, encoding='utf-8') as f:
        requirements = []
        for raw_line in f:
            # Strip once per line (the original called line.strip() three times).
            line = raw_line.strip()
            if line and not line.startswith('#'):
                requirements.append(line)
        return requirements
+envlist=flake8,py3 +# Allow overriding toxworkdir via environment variable +toxworkdir={env:TOXINI_WORK_DIR:{toxinidir}/.tox} +# Avoid .egg-info directories +skipsdist=True + +# The "_vars" section is ignored by tox but we place some useful shared +# variables in it to avoid needless repetition. +[_vars] +# Where to write build artefacts. We default to the "build" directory in the +# tox.ini file's directory. Override with the TOXINI_ARTEFACT_DIR environment +# variable. +build_root={env:TOXINI_ARTEFACT_DIR:{toxinidir}/build} + +[testenv] +# Additional dependencies +deps= + . + coverage + pytest + pytest-cov +# Which environment variables should be passed into the environment. +passenv= +# Allow people to override the coverage report location should they so wish. + COVERAGE_FILE +# Location of the coverage.xml file + COVERAGE_XML_FILE +# How to run the test suite. Note that arguments passed to tox are passed on to +# the test command. +commands= + pytest --doctest-modules --cov={toxinidir} --junitxml={[_vars]build_root}/{envname}/junit.xml + coverage html --directory {[_vars]build_root}/{envname}/htmlcov/ + coverage xml -o {env:COVERAGE_XML_FILE:{[_vars]build_root}/{envname}/coverage.xml} +# Allow sitepackages setting to be overridden via TOX_SITEPACKAGES environment +# variable. The tox container uses this to avoid re-installing the same packages +# over and over again. +sitepackages={env:TOXINI_SITEPACKAGES:False} + +[testenv:py3] +basepython=python3 + +# Check for PEP8 violations +[testenv:flake8] +basepython=python3 +deps= +# We specify a specific version of flake8 to avoid introducing "false" +# regressions when new checks are introduced. The version of flake8 used may +# be overridden via the TOXINI_FLAKE8_VERSION environment variable. + flake8=={env:TOXINI_FLAKE8_VERSION:3.6.0} +commands= + flake8 --version + flake8 .