diff --git a/Dockerfile b/Dockerfile index 694d30c0f6eea52bb60036e7c30e40a6bbfbbe01..d210cce417d1552a2eee66d210eabadc023745cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # This Dockerfile is intended only to support the Auto-DevOps pipeline on GitLab. # It's not intended to package the application. -FROM uisautomation/python:3.7-alpine +FROM registry.gitlab.developers.cam.ac.uk/uis/devops/infra/dockerimages/python:3.7-alpine WORKDIR /usr/src/app diff --git a/README.md b/README.md index 9ceb4cc74ea7f9a61d365dda64e6fdbae1089c69..798956fd5326f474ace3464489677bb2ec34c0c3 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,10 @@ file in the following places in the following order: The first located file is used. +> **Note:** The `crypt` python library behaves differently on MacOS than Linux +> and fails to create acceptable passwords for new Google accounts. +> Build the docker image and run gsuitesync via it to overcome this. + ## Installation The command-line tool can be installed directly from the git repository: diff --git a/configuration-example.yaml b/configuration-example.yaml index dba8141fb3eb255ec2511ec3a7cee68ce5d5a308..c8d8004749dea384b97d1e81a5682a653dbbff72 100644 --- a/configuration-example.yaml +++ b/configuration-example.yaml @@ -151,6 +151,9 @@ ldap: # use SSL when connecting to the LDAP server, and will attempt to # authenticate with these credentials. # + # Username needs to be the full DN of the group, e.g. + # groupid=123456,ou=groups,o=example-corps,dc=example,dc=com + # # The username and password properties should _not_ be specified when running # the sync tool inside the CUDN (which includes running in the CI pipeline). username: null diff --git a/gsuitesync/config.py b/gsuitesync/config.py deleted file mode 100644 index 4f51965719b94eccfcc40a527e21cdf087f4885f..0000000000000000000000000000000000000000 --- a/gsuitesync/config.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Utilities for parsing configuration files. 
- -""" -import dataclasses -import logging -import os - -import yaml - -LOG = logging.getLogger(__name__) - - -class ConfigurationError(RuntimeError): - """ - Base class for all configuration errors. - - """ - - -class ConfigurationNotFound(ConfigurationError): - """ - A suitable configuration could not be located. - - """ - def __init__(self): - return super().__init__('Could not find any configuration file') - - -def load_configuration(location=None): - """ - Load configuration and return a :py:class:`Configuration` instance. Pass a non-None location to - override the default search path. - - :raises: ConfigurationError if the configuration could not be loaded. - - """ - if location is not None: - paths = [location] - else: - if 'GSUITESYNC_CONFIGURATION' in os.environ: - paths = [os.environ['GSUITESYNC_CONFIGURATION']] - else: - paths = [] - paths.extend([ - os.path.join(os.getcwd(), 'gsuitesync.yaml'), - os.path.expanduser('~/.gsuitesync/configuration.yaml'), - '/etc/gsuitesync/configuration.yaml' - ]) - - valid_paths = [path for path in paths if os.path.isfile(path)] - - if len(valid_paths) == 0: - LOG.error('Could not find configuration file. Tried:') - for path in paths: - LOG.error('"%s"', path) - raise ConfigurationNotFound() - - with open(valid_paths[0]) as f: - return yaml.safe_load(f) - - -class ConfigurationDataclassMixin: - """ - Mixin class for dataclass which adds a "from_dict" member which will construct an instance from - a dictionary. Fields which have no default value become required fields. - - """ - - @classmethod - def from_dict(cls, dict_): - """ - Construct an instance from a dict. 
- - """ - field_names = {field.name for field in dataclasses.fields(cls)} - required_field_names = { - field.name for field in dataclasses.fields(cls) - if field.default is dataclasses.MISSING - } - - for key in dict_.keys(): - if key not in field_names: - raise ValueError(f'Unknown configuration key: {key}') - - for key in required_field_names: - if key not in dict_: - raise ValueError(f'{key}: required field not set') - - return cls(**dict_) diff --git a/gsuitesync/config/__init__.py b/gsuitesync/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a1cc685e8a4f4bf9b05de88f7c67c6db27e7d610 --- /dev/null +++ b/gsuitesync/config/__init__.py @@ -0,0 +1,5 @@ +""" +Configuration definitions + +""" +from .utils import load_configuration, parse_configuration # noqa: F401 diff --git a/gsuitesync/config/exceptions.py b/gsuitesync/config/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..a6cb9b50a7ae57eeae7e3095886e3526320df0a5 --- /dev/null +++ b/gsuitesync/config/exceptions.py @@ -0,0 +1,20 @@ +""" +Configuration Exceptions + +""" + + +class ConfigurationError(RuntimeError): + """ + Base class for all configuration errors. + + """ + + +class ConfigurationNotFound(ConfigurationError): + """ + A suitable configuration could not be located. + + """ + def __init__(self): + return super().__init__('Could not find any configuration file') diff --git a/gsuitesync/config/gapiauth.py b/gsuitesync/config/gapiauth.py new file mode 100644 index 0000000000000000000000000000000000000000..11b62c2c72bff7004c25e6f63f52f0ac384b4f6e --- /dev/null +++ b/gsuitesync/config/gapiauth.py @@ -0,0 +1,27 @@ +""" +Google API authentication. + +""" +import dataclasses +import logging +import typing + +from .mixin import ConfigurationDataclassMixin + + +LOG = logging.getLogger(__name__) + + +@dataclasses.dataclass +class Configuration(ConfigurationDataclassMixin): + """ + Configuration of Google API access credentials. 
+ + """ + # Path to on-disk JSON credentials used when accessing the API. + credentials: str + + # Path to on-disk JSON credentials used when accessing the API in "read-only" mode. Use this if + # you want to have a separate "safe" service account which can only read data. If null, use the + # same credentials for reading and writing. + read_only_credentials: typing.Union[str, None] = None diff --git a/gsuitesync/gapidomain.py b/gsuitesync/config/gapidomain.py similarity index 95% rename from gsuitesync/gapidomain.py rename to gsuitesync/config/gapidomain.py index a4dc18c3d12bf712f4ed9c7cc252104eb0aa51eb..60389b07a7a7c9dc724779a68ec7928930310057 100644 --- a/gsuitesync/gapidomain.py +++ b/gsuitesync/config/gapidomain.py @@ -5,7 +5,7 @@ Google Domain management. import dataclasses import typing -from .config import ConfigurationDataclassMixin +from .mixin import ConfigurationDataclassMixin @dataclasses.dataclass diff --git a/gsuitesync/config/ldap.py b/gsuitesync/config/ldap.py new file mode 100644 index 0000000000000000000000000000000000000000..75fb5372670066e3c96bfa71c4b5a6deaa7c4182 --- /dev/null +++ b/gsuitesync/config/ldap.py @@ -0,0 +1,39 @@ +""" +Retrieving user information from an LDAP directory. + +""" +import dataclasses +import typing + +from .mixin import ConfigurationDataclassMixin + + +@dataclasses.dataclass +class Configuration(ConfigurationDataclassMixin): + """ + Configuration for accessing the LDAP directory. 
+ + """ + host: str + + user_search_base: str + + group_search_base: str + + inst_search_base: str + + eligible_user_filter: str + + eligible_group_filter: str + + eligible_inst_filter: str + + username: str = None + + password: str = None + + managed_user_filter: typing.Union[str, None] = None + + managed_group_filter: typing.Union[str, None] = None + + managed_inst_filter: typing.Union[str, None] = None diff --git a/gsuitesync/limits.py b/gsuitesync/config/limits.py similarity index 97% rename from gsuitesync/limits.py rename to gsuitesync/config/limits.py index d3fa24c798545861af5a66966edf1ff214e6bdf4..980c430e24e4d4c8b8bbab527f65cfc79a1846e8 100644 --- a/gsuitesync/limits.py +++ b/gsuitesync/config/limits.py @@ -6,11 +6,11 @@ import dataclasses import numbers import typing -from . import config +from .mixin import ConfigurationDataclassMixin @dataclasses.dataclass -class Configuration(config.ConfigurationDataclassMixin): +class Configuration(ConfigurationDataclassMixin): """ Configuration for synchronisation limits. diff --git a/gsuitesync/config/mixin.py b/gsuitesync/config/mixin.py new file mode 100644 index 0000000000000000000000000000000000000000..0d54dadf2ed044edd36b4f7c3a4b41a4c2a733cb --- /dev/null +++ b/gsuitesync/config/mixin.py @@ -0,0 +1,30 @@ +""" +Mixin class for dataclass which adds a "from_dict" member which will construct an instance from +a dictionary. Fields which have no default value become required fields. + +""" +import dataclasses + + +class ConfigurationDataclassMixin: + @classmethod + def from_dict(cls, dict_): + """ + Construct an instance from a dict. 
+ + """ + field_names = {field.name for field in dataclasses.fields(cls)} + required_field_names = { + field.name for field in dataclasses.fields(cls) + if field.default is dataclasses.MISSING + } + + for key in dict_.keys(): + if key not in field_names: + raise ValueError(f'Unknown configuration key: {key}') + + for key in required_field_names: + if key not in dict_: + raise ValueError(f'{key}: required field not set') + + return cls(**dict_) diff --git a/gsuitesync/config/sync.py b/gsuitesync/config/sync.py new file mode 100644 index 0000000000000000000000000000000000000000..f19771a17ca84a65361279f2c56779cf1e82f915 --- /dev/null +++ b/gsuitesync/config/sync.py @@ -0,0 +1,56 @@ +""" +Synchronisation configuration. + +""" +import dataclasses +import numbers +import typing + +from .mixin import ConfigurationDataclassMixin + + +@dataclasses.dataclass +class Configuration(ConfigurationDataclassMixin): + # A regular expression which is used to match the organization unit path for Google users who + # should be excluded from the list returned by Google. Those users do not exist for the + # purposes of the rest of the sync and so if they appear in the list of managed users this + # script will attempt to re-add them and fail in the process. Use this setting for users who + # are managed completely outside of this script. + ignore_google_org_unit_path_regex: typing.Union[str, None] = None + + # The organization unit path in which new accounts are placed + new_user_org_unit_path: str = '/' + + # Suffix appended to the names of groups created in Google. The Google group name will be + # "{groupName}{group_name_suffix}", where {groupName} is the Lookup group name. + group_name_suffix: str = ' from lookup.cam.ac.uk' + + # Settings to be applied to groups in Google. These settings are applied to both new and + # existing groups imported from Lookup. 
+ # See https://developers.google.com/admin-sdk/groups-settings/v1/reference/groups#json + group_settings: dict = dataclasses.field(default_factory=lambda: { + 'whoCanJoin': 'INVITED_CAN_JOIN', + 'whoCanViewMembership': 'ALL_IN_DOMAIN_CAN_VIEW', + 'whoCanViewGroup': 'ALL_MEMBERS_CAN_VIEW', + 'whoCanPostMessage': 'ALL_IN_DOMAIN_CAN_POST', + 'allowWebPosting': 'false', + 'messageModerationLevel': 'MODERATE_ALL_MESSAGES', + 'includeInGlobalAddressList': 'true', + 'whoCanLeaveGroup': 'NONE_CAN_LEAVE', + 'whoCanContactOwner': 'ALL_MANAGERS_CAN_CONTACT', + 'whoCanModerateMembers': 'OWNERS_ONLY', + 'whoCanDiscoverGroup': 'ALL_IN_DOMAIN_CAN_DISCOVER', + }) + + # Inter-batch delay in seconds. This is useful to avoid hitting Google rate limits. + inter_batch_delay: numbers.Real = 5 + + # Batch size for Google API calls. Google supports batching requests together into one API + # call. + batch_size: int = 50 + + # Number of times to retry HTTP requests if a HTTP failure response is received + http_retries: int = 5 + + # Delay in seconds between retying a request that has failed + http_retry_delay: numbers.Real = 5 diff --git a/gsuitesync/config/utils.py b/gsuitesync/config/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a12d50b293ce87dad6c6ac5ce84c91a5dfb8b075 --- /dev/null +++ b/gsuitesync/config/utils.py @@ -0,0 +1,59 @@ +import logging +import os +import yaml + +from .exceptions import ConfigurationNotFound + +# Configuration declarations +from . import gapiauth, gapidomain, ldap, limits, sync + +LOG = logging.getLogger(__name__) + + +def load_configuration(location=None): + """ + Load configuration and return a :py:class:`Configuration` instance. Pass a non-None location to + override the default search path. + + :raises: ConfigurationError if the configuration could not be loaded. 
+ + """ + if location is not None: + paths = [location] + else: + if 'GSUITESYNC_CONFIGURATION' in os.environ: + paths = [os.environ['GSUITESYNC_CONFIGURATION']] + else: + paths = [] + paths.extend([ + os.path.join(os.getcwd(), 'gsuitesync.yaml'), + os.path.expanduser('~/.gsuitesync/configuration.yaml'), + '/etc/gsuitesync/configuration.yaml' + ]) + + valid_paths = [path for path in paths if os.path.isfile(path)] + + if len(valid_paths) == 0: + LOG.error('Could not find configuration file. Tried:') + for path in paths: + LOG.error('"%s"', path) + raise ConfigurationNotFound() + + with open(valid_paths[0]) as f: + return yaml.safe_load(f) + + +def parse_configuration(configuration): + """ + Parses the multiple parts of configuration using appropriate Configuration classes. + Returns a dict containing parsed parts of configuration. + + """ + return { + 'sync': sync.Configuration.from_dict(configuration.get('sync', {})), + 'gapi_domain': gapidomain.Configuration.from_dict(configuration.get('google_domain', {})), + 'ldap': ldap.Configuration.from_dict(configuration.get('ldap', {})), + 'limits': limits.Configuration.from_dict(configuration.get('limits', {})), + 'gapi_auth': gapiauth.Configuration.from_dict( + configuration.get('google_api', {}).get('auth', {})), + } diff --git a/gsuitesync/gapiauth.py b/gsuitesync/gapiauth.py deleted file mode 100644 index 436e7696a9562658888d450f37b2ba1dd3bd74ba..0000000000000000000000000000000000000000 --- a/gsuitesync/gapiauth.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Google API authentication. - -""" -import dataclasses -import logging -import typing - -from google.oauth2 import service_account - -from .config import ConfigurationDataclassMixin - - -LOG = logging.getLogger(__name__) - - -@dataclasses.dataclass -class Configuration(ConfigurationDataclassMixin): - """ - Configuration of Google API access credentials. - - """ - # Path to on-disk JSON credentials used when accessing the API. 
- credentials: str - - # Path to on-disk JSON credentials used when accessing the API in "read-only" mode. Use this if - # you want to have a separate "safe" service account which can only read data. If null, use the - # same credentials for reading and writing. - read_only_credentials: typing.Union[str, None] = None - - def load_credentials(self, *, read_only=True): - """ - Create a Google credentials object from the configuration. Use *read_only* to indicate if - read-only credentials are preferred. - - """ - credentials = self.credentials - if read_only and self.read_only_credentials is not None: - credentials = self.read_only_credentials - LOG.info('Using read-only credentials.') - - LOG.info('Loading Google account credentials from "%s"', credentials) - return service_account.Credentials.from_service_account_file(credentials) diff --git a/gsuitesync/gapiutil.py b/gsuitesync/gapiutil.py index abe46bfac11ebf6e2d5ed53864c91e36d35566e7..94f43854ebddfacc2cbf4ebd88bc613c7f176a74 100644 --- a/gsuitesync/gapiutil.py +++ b/gsuitesync/gapiutil.py @@ -3,6 +3,7 @@ Utility functions which should have been part of the Google API client. """ import logging +import itertools from googleapiclient.errors import HttpError from time import sleep @@ -55,7 +56,7 @@ def list_all_in_list(directory_service, list_cb, *, item_ids=[], id_key='key', b the "list_cb" Google API method for each item in the "item_ids" list, repeatedly fetching pages of results for each item and merging them together. The key used to identify the original items in Google is specified by the "id_key" argument. Returns a dictionary mapping - the orginal item IDs to the merged "items" arrays from the responses for each item. + the original item IDs to the merged "items" arrays from the responses for each item. 
This is equivalent to calling list_all() for each item in the "item_ids" list, and collecting all the results in a dictionary, except that it uses the Google batch processing API to reduce @@ -179,3 +180,79 @@ def get_all_in_list(directory_service, get_cb, *, item_ids=[], id_key='key', bat break return resources + + +def process_requests(service, requests, sync_config, read_only=True): + """ + Process an iterable list of requests to the specified Google service in batches. + These APIs support a maximum batch size of 1000. See: + https://developers.google.com/admin-sdk/directory/v1/guides/batch + + """ + for request_batch in _grouper(requests, n=sync_config.batch_size): + # Form batch request. + batch = service.new_batch_http_request() + for request in request_batch: + batch.add(request, callback=_handle_batch_response) + + # Execute the batch request if not in read only mode. Otherwise log that we would + # have. + if not read_only: + LOG.info('Issuing batch request to Google.') + sleep(sync_config.inter_batch_delay) + retries = sync_config.http_retries + while True: + try: + batch.execute() + except HttpError as err: + if (err.resp.status == 503 and retries > 0): + retries -= 1 + LOG.warn('503: Service unavailable - retrying') + sleep(sync_config.http_retry_delay) + continue + if retries == 0: + LOG.error('503: Service unavailable - retry count exceeded') + raise + break + else: + LOG.info('Not issuing batch request in read-only mode.') + + +def _handle_batch_response(request_id, response, exception): + if exception is not None: + LOG.error('Error performing request: %s', exception) + LOG.error('Response: %r', response) + + +def _grouper(iterable, *, n): + """ + Group an iterable into chunks of at most *n* elements. A generator which yields iterables + representing slices of *iterable*. + + >>> [list(i) for i in _grouper('ABCDEFGH', n=3)] + [['A', 'B', 'C'], ['D', 'E', 'F'], ['G', 'H']] + >>> def generator(stop): + ... for x in range(stop): + ... 
yield x + >>> [list(i) for i in _grouper(generator(10), n=3)] + [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] + >>> [list(i) for i in _grouper(generator(12), n=3)] + [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]] + + The implementation of this function attempts to be efficient; the chunks are iterables which + are generated on demand rather than being constructed first. Hence this function can deal with + iterables which would fill memory if intermediate chunks were stored. + + >>> i = _grouper(generator(100000000000000000000), n=1000000000000000) + >>> next(next(i)) + 0 + + """ + it = iter(iterable) + while True: + next_chunk_it = itertools.islice(it, n) + try: + first = next(next_chunk_it) + except StopIteration: + return + yield itertools.chain((first,), next_chunk_it) diff --git a/gsuitesync/ldap.py b/gsuitesync/ldap.py deleted file mode 100644 index 6a493dacd355b71cb8bee78206adc0ccad3216ff..0000000000000000000000000000000000000000 --- a/gsuitesync/ldap.py +++ /dev/null @@ -1,203 +0,0 @@ -""" -Retrieving user information from an LDAP directory. - -""" -import collections -import dataclasses -import typing - -import ldap3 - -from .config import ConfigurationDataclassMixin - - -# User and group information we need to populate the Google user directory. -UserEntry = collections.namedtuple('UserEntry', 'uid cn sn displayName givenName') -GroupEntry = collections.namedtuple('GroupEntry', 'groupID groupName description uids') - - -@dataclasses.dataclass -class Configuration(ConfigurationDataclassMixin): - """ - Configuration for accessing the LDAP directory. 
- - """ - host: str - - user_search_base: str - - group_search_base: str - - inst_search_base: str - - eligible_user_filter: str - - eligible_group_filter: str - - eligible_inst_filter: str - - username: str = None - - password: str = None - - managed_user_filter: typing.Union[str, None] = None - - managed_group_filter: typing.Union[str, None] = None - - managed_inst_filter: typing.Union[str, None] = None - - def get_eligible_uids(self): - """ - Return a set containing all uids who are eligible to have a Google account. - - """ - return { - e['attributes']['uid'][0] - for e in self._search( - search_base=self.user_search_base, search_filter=self.eligible_user_filter, - attributes=['uid'] - ) - } - - def get_eligible_groupIDs(self): - """ - Return a set containing all groupIDs that are eligible for Google. - - """ - return { - e['attributes']['groupID'][0] - for e in self._search( - search_base=self.group_search_base, search_filter=self.eligible_group_filter, - attributes=['groupID'] - ) - } - - def get_eligible_instIDs(self): - """ - Return a set containing all instIDs that are eligible for Google. - - """ - return { - e['attributes']['instID'][0] - for e in self._search( - search_base=self.inst_search_base, search_filter=self.eligible_inst_filter, - attributes=['instID'] - ) - } - - def get_managed_user_entries(self): - """ - Return a list containing all managed user entries as UserEntry instances. 
- - """ - search_filter = ( - self.managed_user_filter - if self.managed_user_filter is not None - else self.eligible_user_filter - ) - return [ - UserEntry( - uid=_extract(e, 'uid'), cn=_extract(e, 'cn'), sn=_extract(e, 'sn'), - displayName=_extract(e, 'displayName'), givenName=_extract(e, 'givenName') - ) - for e in self._search( - search_base=self.user_search_base, search_filter=search_filter, - attributes=['uid', 'cn', 'sn', 'displayName', 'givenName'] - ) - ] - - def get_managed_group_entries(self): - """ - Return a list containing all managed group entries as GroupEntry instances. - - """ - search_filter = ( - self.managed_group_filter - if self.managed_group_filter is not None - else self.eligible_group_filter - ) - return [ - GroupEntry( - groupID=_extract(e, 'groupID'), groupName=_extract(e, 'groupName'), - description=_extract(e, 'description'), uids=set(e['attributes'].get('uid', [])) - ) - for e in self._search( - search_base=self.group_search_base, search_filter=search_filter, - attributes=['groupID', 'groupName', 'description', 'uid'] - ) - ] - - def get_managed_inst_entries(self): - """ - Return a list containing all managed institution entries as GroupEntry instances. - - Note that we return GroupEntry instances here since Lookup institutions become groups in - Google, and this simplifies the sync code by allowing us to handle institutions in the same - way as groups. The GroupEntry's groupID and groupName fields will be the institution's - instID and ou (name) respectively. Since Lookup institutions don't have descriptions, we - set the description field to the institution's name as well (in Google, the description - allows longer strings, and so will not truncate the name). - - """ - # This requires 2 LDAP queries. First find the managed institutions. 
- search_filter = ( - self.managed_inst_filter - if self.managed_inst_filter is not None - else self.eligible_inst_filter - ) - managed_insts = [ - GroupEntry( - groupID=_extract(e, 'instID'), groupName=_extract(e, 'ou'), - description=_extract(e, 'ou'), uids=set(), - ) - for e in self._search( - search_base=self.inst_search_base, search_filter=search_filter, - attributes=['instID', 'ou'] - ) - ] - managed_insts_by_instID = {g.groupID: g for g in managed_insts} - - # Then get each eligible user's list of institutions and use that data to populate each - # institution's uid list. - eligible_users = self._search( - search_base=self.user_search_base, search_filter=self.eligible_user_filter, - attributes=['uid', 'instID'] - ) - for e in eligible_users: - uid = e['attributes']['uid'][0] - for instID in e['attributes']['instID']: - if instID in managed_insts_by_instID: - managed_insts_by_instID[instID].uids.add(uid) - - return managed_insts - - def _search(self, *, search_base, search_filter, attributes): - # Use SSL to access the LDAP server when authentication credentials - # have been configured - use_ssl = self.username and self.password - ldap_server = ldap3.Server(self.host, use_ssl=use_ssl) - - # Keyword arguments to pass to ldap3.Connection - connection_kwargs = { - 'auto_bind': True - } - - # Add authentication credentials if configured - if self.username: - connection_kwargs['username'] = self.username - if self.password: - connection_kwargs['password'] = self.password - - # Connect to the LDAP server and perform the query - with ldap3.Connection(ldap_server, **connection_kwargs) as conn: - return conn.extend.standard.paged_search( - search_base, search_filter, paged_size=1000, attributes=attributes) - - -def _extract(entry, attr, *, default=''): - vs = entry['attributes'].get(attr, []) - if len(vs) == 0: - return default - if isinstance(vs, str): - return vs - return vs[0] diff --git a/gsuitesync/sync.py b/gsuitesync/sync.py deleted file mode 100644 index 
d0d3ecb442a333f4af4628dad3e5fd7acbb216ad..0000000000000000000000000000000000000000 --- a/gsuitesync/sync.py +++ /dev/null @@ -1,928 +0,0 @@ -""" -Synchronise Google Directory with a local LDAP directory. - -""" -import crypt -import dataclasses -import itertools -import logging -import numbers -import re -import secrets -import time -import typing - -from googleapiclient import discovery, errors - -from . import config -from . import gapiauth -from . import gapidomain -from . import gapiutil -from . import ldap -from . import limits -from . import naming - -LOG = logging.getLogger(__name__) - -# Scopes required to perform read-only actions. -READ_ONLY_SCOPES = [ - 'https://www.googleapis.com/auth/admin.directory.user.readonly', - 'https://www.googleapis.com/auth/admin.directory.group.readonly', - 'https://www.googleapis.com/auth/admin.directory.group.member.readonly', - 'https://www.googleapis.com/auth/apps.groups.settings' -] - -# Scopes *in addition to READ_ONLY_SCOPES* required to perform a full update. -WRITE_SCOPES = [ - 'https://www.googleapis.com/auth/admin.directory.user', - 'https://www.googleapis.com/auth/admin.directory.group', - 'https://www.googleapis.com/auth/admin.directory.group.member' -] - - -@dataclasses.dataclass -class Configuration(config.ConfigurationDataclassMixin): - # A regular expression which is used to match the organization unit path for Google users who - # should be excluded from the list returned by Google. Those users do not exist for the - # purposes of the rest of the sync and so if they appear in the list of managed users this - # script will attempt to re-add them and fail in the process. Use this setting for users who - # are managed completely outside of this script. - ignore_google_org_unit_path_regex: typing.Union[str, None] = None - - # The organization unit path in which new accounts are placed - new_user_org_unit_path: str = '/' - - # Suffix appended to the names of groups created in Google. 
The Google group name will be - # "{groupName}{group_name_suffix}", where {groupName} is the Lookup group name. - group_name_suffix: str = ' from lookup.cam.ac.uk' - - # Settings to be applied to groups in Google. These settings are applied to both new and - # existing groups imported from Lookup. - # See https://developers.google.com/admin-sdk/groups-settings/v1/reference/groups#json - group_settings: dict = dataclasses.field(default_factory=lambda: { - 'whoCanJoin': 'INVITED_CAN_JOIN', - 'whoCanViewMembership': 'ALL_IN_DOMAIN_CAN_VIEW', - 'whoCanViewGroup': 'ALL_MEMBERS_CAN_VIEW', - 'whoCanPostMessage': 'ALL_IN_DOMAIN_CAN_POST', - 'allowWebPosting': 'false', - 'messageModerationLevel': 'MODERATE_ALL_MESSAGES', - 'includeInGlobalAddressList': 'true', - 'whoCanLeaveGroup': 'NONE_CAN_LEAVE', - 'whoCanContactOwner': 'ALL_MANAGERS_CAN_CONTACT', - 'whoCanModerateMembers': 'OWNERS_ONLY', - 'whoCanDiscoverGroup': 'ALL_IN_DOMAIN_CAN_DISCOVER', - }) - - # Inter-batch delay in seconds. This is useful to avoid hitting Google rate limits. - inter_batch_delay: numbers.Real = 5 - - # Batch size for Google API calls. Google supports batching requests together into one API - # call. 
- batch_size: int = 50 - - # Number of times to retry HTTP requests if a HTTP failure response is received - http_retries: int = 5 - - # Delay in seconds between retying a request that has failed - http_retry_delay: numbers.Real = 5 - - -def sync(configuration, *, read_only=True, group_settings=False, just_users=False): - """Perform sync given configuration dictionary.""" - if read_only: - LOG.info('Performing synchronisation in READ ONLY mode.') - else: - LOG.info('Performing synchronisation in WRITE mode.') - - # Parse configuration - sync_config = Configuration.from_dict(configuration.get('sync', {})) - gapi_auth_config = gapiauth.Configuration.from_dict( - configuration.get('google_api', {}).get('auth', {})) - gapi_domain_config = gapidomain.Configuration.from_dict( - configuration.get('google_domain', {})) - ldap_config = ldap.Configuration.from_dict(configuration.get('ldap', {})) - limits_config = limits.Configuration.from_dict(configuration.get('limits', {})) - - # Load appropriate Google credentials. - creds = ( - gapi_auth_config.load_credentials(read_only=read_only) - .with_scopes(READ_ONLY_SCOPES + ([] if read_only else WRITE_SCOPES)) - ) - # Use admin_user if using service account with Domain-Wide Delegation - if gapi_domain_config.admin_user: - creds = creds.with_subject(gapi_domain_config.admin_user) - - # Secondary domain for Google groups that come from Lookup groups - groups_domain = ( - gapi_domain_config.groups_domain - if gapi_domain_config.groups_domain is not None - else gapi_domain_config.name - ) - - # Secondary domain for Google groups that come from Lookup institutions - insts_domain = ( - gapi_domain_config.insts_domain - if gapi_domain_config.insts_domain is not None - else gapi_domain_config.name - ) - - # Functions to translate the unique identifiers of users, groups and institutions in Lookup - # (uids, groupIDs and instIDs) to and from the unique identifiers used in Google (email - # addresses). 
- # - # For users: {uid} <-> {uid}@{domain} - # For groups: {groupID} <-> {groupID}@{groups_domain} - # For insts: {instID} <-> {instID.lower()}@{insts_domain} (local part must be lowercase) - # - # Additionally, valid uids (CRSids) match the regex [a-z][a-z0-9]{3,7}, valid groupIDs match - # the regex [0-9]{6,8} and valid instIDs match the regex [A-Z][A-Z0-9]+. - # - # Since Lookup institutions become groups in Google, we use common code to sync all Google - # groups, regardless of whether they were groups or institutions in Lookup. In all the code - # that follows, we use "gid" to refer to the unique identifier of the group or institution in - # Lookup (i.e., gid may be either a Lookup groupID or instID). - user_email_regex = re.compile('^[a-z][a-z0-9]{3,7}@.*$') - groupID_regex = re.compile('^[0-9]{6,8}$') - instID_regex = re.compile('^[A-Z][A-Z0-9]+$') - - def uid_to_email(uid): - return f'{uid}@{gapi_domain_config.name}' - - def email_to_uid(email): - return email.split('@')[0] if user_email_regex.match(email) else None - - def gid_to_email(gid): - return ( - f'{gid}@{groups_domain}' if groupID_regex.match(gid) else - f'{gid.lower()}@{insts_domain}' if instID_regex.match(gid) else None - ) - - def email_to_gid(email): - gid = email.split('@')[0] - return ( - gid if groupID_regex.match(gid) else - gid.upper() if instID_regex.match(gid.upper()) else None - ) - - # -------------------------------------------------------------------------------------------- - # Load current user, group and institution data from Lookup. - # -------------------------------------------------------------------------------------------- - - # Get a set containing all CRSids. These are all the people who are eligible to be in our - # GSuite instance. If a user is in GSuite and is *not* present in this list then they are - # suspended. 
- LOG.info('Reading eligible user entries from LDAP') - eligible_uids = ldap_config.get_eligible_uids() - LOG.info('Total LDAP user entries: %s', len(eligible_uids)) - - # Sanity check: there are some eligible users (else LDAP lookup failure?) - if len(eligible_uids) == 0: - raise RuntimeError('Sanity check failed: no users in eligible set') - - if just_users: - eligible_gids = set() - else: - # Get a set containing all groupIDs. These are all the groups that are eligible to be in - # our GSuite instance. If a group is in GSuite and is *not* present in this list then it - # is deleted. - LOG.info('Reading eligible group entries from LDAP') - eligible_groupIDs = ldap_config.get_eligible_groupIDs() - LOG.info('Total LDAP group entries: %s', len(eligible_groupIDs)) - - # Get a set containing all instIDs. These are all the institutions that are eligible to be - # in our GSuite instance. If an institution is in GSuite and is *not* present in this list - # then the corresponding group is deleted. - LOG.info('Reading eligible institution entries from LDAP') - eligible_instIDs = ldap_config.get_eligible_instIDs() - LOG.info('Total LDAP institution entries: %s', len(eligible_instIDs)) - - # Add these sets together to form the set of all gids (the IDs of all eligible groups and - # institutions). - eligible_gids = eligible_groupIDs | eligible_instIDs - LOG.info('Total combined LDAP group and institution entries: %s', len(eligible_gids)) - - # Get a list of managed users. These are all the people who match the "managed_user_filter" in - # the LDAP settings. - LOG.info('Reading managed user entries from LDAP') - managed_user_entries = ldap_config.get_managed_user_entries() - - # Form a mapping from uid to managed user. 
- managed_user_entries_by_uid = {u.uid: u for u in managed_user_entries} - - # Form a set of all *managed user* uids - managed_user_uids = set(managed_user_entries_by_uid.keys()) - LOG.info('Total managed user entries: %s', len(managed_user_uids)) - - # Sanity check: the managed users should be a subset of the eligible ones. - if len(managed_user_uids - eligible_uids) != 0: - raise RuntimeError('Sanity check failed: some managed uids were not in the eligible set') - - if just_users: - managed_group_entries = [] - managed_group_entries_by_gid = dict() - else: - # Get a list of managed groups. These are all the groups that match the - # "managed_group_filter" in the LDAP settings. - LOG.info('Reading managed group entries from LDAP') - managed_group_entries = ldap_config.get_managed_group_entries() - - # Form a mapping from groupID to managed group. - managed_group_entries_by_groupID = {g.groupID: g for g in managed_group_entries} - - # Form a set of all *managed group* groupIDs - managed_group_groupIDs = set(managed_group_entries_by_groupID.keys()) - LOG.info('Total managed group entries: %s', len(managed_group_groupIDs)) - LOG.info( - 'Total managed group members: %s', - sum([len(g.uids) for g in managed_group_entries]) - ) - - # Get a list of managed institutions. These are all the institutions that match the - # "managed_inst_filter" in the LDAP settings. - LOG.info('Reading managed institution entries from LDAP') - managed_inst_entries = ldap_config.get_managed_inst_entries() - - # Form a mapping from instID to managed institution. 
- managed_inst_entries_by_instID = {i.groupID: i for i in managed_inst_entries} - - # Form a set of all *managed institution* instIDs - managed_inst_instIDs = set(managed_inst_entries_by_instID.keys()) - LOG.info('Total managed institution entries: %s', len(managed_inst_instIDs)) - LOG.info( - 'Total managed institution members: %s', - sum([len(i.uids) for i in managed_inst_entries]) - ) - - # Add the collections of managed institutions to the collections of managed groups. - managed_group_entries += managed_inst_entries - managed_group_entries_by_gid = { - **managed_group_entries_by_groupID, **managed_inst_entries_by_instID - } - managed_group_gids = managed_group_groupIDs | eligible_instIDs - LOG.info( - 'Total combined managed group and institution entries: %s', len(managed_group_gids) - ) - LOG.info( - 'Total combined managed group and institution members: %s', - sum([len(g.uids) for g in managed_group_entries]) - ) - - # Sanity check: the managed groups should be a subset of the eligible ones. - if len(managed_group_gids - eligible_gids) != 0: - raise RuntimeError( - 'Sanity check failed: some managed gids were not in the eligible set' - ) - - # -------------------------------------------------------------------------------------------- - # Load current user, group and institution data from Google. - # -------------------------------------------------------------------------------------------- - - # Build the directory service using Google API discovery. - directory_service = discovery.build('admin', 'directory_v1', credentials=creds) - - # Also build the groupssettings service, which is a parallel API to manage group settings - groupssettings_service = discovery.build('groupssettings', 'v1', credentials=creds) - - # Retrieve information on all users excluding domain admins. 
- LOG.info('Getting information on Google domain users') - fields = [ - 'id', 'isAdmin', 'orgUnitPath', 'primaryEmail', 'suspended', 'suspensionReason', - 'name(givenName, familyName)', - ] - all_google_users = gapiutil.list_all( - directory_service.users().list, items_key='users', domain=gapi_domain_config.name, - query='isAdmin=false', fields='nextPageToken,users(' + ','.join(fields) + ')', - retries=sync_config.http_retries, retry_delay=sync_config.http_retry_delay, - ) - - # Function to fetch Google group information from the specified domain - def fetch_groups(domain): - fields = ['id', 'email', 'name', 'description'] - return gapiutil.list_all( - directory_service.groups().list, items_key='groups', domain=domain, - fields='nextPageToken,groups(' + ','.join(fields) + ')', - retries=sync_config.http_retries, retry_delay=sync_config.http_retry_delay, - ) - - if just_users: - # pretend there are no google groups - all_google_groups = [] - else: - # Retrieve information on all Google groups that come from Lookup groups - LOG.info('Getting information on Google domain groups') - all_google_groups = [ - g for g in fetch_groups(groups_domain) - if groupID_regex.match(g['email'].split('@')[0]) - ] - - # Append information on all Google groups that come from Lookup institutions - LOG.info('Getting information on Google domain institutions') - all_google_groups.extend([ - g for g in fetch_groups(insts_domain) - if instID_regex.match(g['email'].split('@')[0].upper()) - ]) - - # Strip any "to be ignored" users out of the results. 
- if sync_config.ignore_google_org_unit_path_regex is not None: - LOG.info( - 'Ignoring users whose organization unit path matches %r', - sync_config.ignore_google_org_unit_path_regex) - # Check that all users have an orgUnitPath - missing_org = [ - u for u in all_google_users if 'orgUnitPath' not in u - ] - if len(missing_org) != 0: - LOG.error('User entries missing orgUnitPath: %s (starting with %s)', - len(missing_org), - missing_org[0]['primaryEmail'] if 'primaryEmail' in missing_org[0] - else 'user with blank email') - raise RuntimeError('Sanity check failed: at least one user is missing orgUnitPath') - # Remove users matching regex - regex = re.compile(sync_config.ignore_google_org_unit_path_regex) - all_google_users = [ - u for u in all_google_users if not regex.match(u['orgUnitPath']) - ] - - # Strip out any users with uids (extracted from the local-part of the email address) that - # aren't valid CRSids. These users can't have come from Lookup, and so should not be managed - # (suspended) by this script. - all_google_users = [u for u in all_google_users if email_to_uid(u['primaryEmail'])] - - # Strip out any groups whose email addresses don't match the pattern for groups created - # from Lookup groupIDs or instIDs, and which therefore should not be managed (deleted) by - # this script. - all_google_groups = [g for g in all_google_groups if email_to_gid(g['email'])] - - # Sanity check. There should be no admins in the returned results. - if any(u.get('isAdmin', False) for u in all_google_users): - raise RuntimeError('Sanity check failed: admin users in user list') - - # Form mappings from uid/gid to Google user/group. - all_google_users_by_uid = {email_to_uid(u['primaryEmail']): u for u in all_google_users} - all_google_groups_by_gid = {email_to_gid(g['email']): g for g in all_google_groups} - - # Form sets of all Google-side uids and gids. 
The all_google_uids set is all users including - # the suspended ones and the suspended_google_uids set is only the suspended users. Non - # suspended users are therefore all_google_uids - suspended_google_uids. The all_google_gids - # set includes both groupIDs and instIDs. Groups in Google do not have any concept of being - # suspended. - all_google_uids = set(all_google_users_by_uid.keys()) - all_google_gids = set(all_google_groups_by_gid.keys()) - suspended_google_uids = {uid for uid, u in all_google_users_by_uid.items() if u['suspended']} - - # Sanity check. We should not have lost anything. (I.e. the uids and gids should be unique.) - if len(all_google_uids) != len(all_google_users): - raise RuntimeError('Sanity check failed: user list changed length') - if len(all_google_gids) != len(all_google_groups): - raise RuntimeError('Sanity check failed: group list changed length') - - if group_settings and not just_users: - # Retrieve all Google group settings. - fields = ['email', *[k for k in sync_config.group_settings.keys()]] - all_google_group_settings = gapiutil.get_all_in_list( - groupssettings_service, groupssettings_service.groups().get, - item_ids=[g['email'] for g in all_google_groups], id_key='groupUniqueId', - batch_size=sync_config.batch_size, fields=','.join(fields), - retries=sync_config.http_retries, retry_delay=sync_config.http_retry_delay, - ) - - # Form a mapping from gid to Google group settings. - all_google_group_settings_by_gid = { - email_to_gid(g['email']): g for g in all_google_group_settings - } - - # Sanity check. We should have settings for each managed group. - if len(all_google_group_settings_by_gid) != len(all_google_groups): - raise RuntimeError( - 'Sanity check failed: group settings list does not match group list' - ) - - # Retrieve all Google group memberships. This is a mapping from internal Google group ids to - # lists of member resources, corresponding to both Lookup groups and institutions. 
- if just_users: - all_google_members = dict() - else: - fields = ['id', 'email'] - all_google_members = gapiutil.list_all_in_list( - directory_service, directory_service.members().list, - item_ids=[g['id'] for g in all_google_groups], id_key='groupKey', - batch_size=sync_config.batch_size, items_key='members', - fields='nextPageToken,members(' + ','.join(fields) + ')', - retries=sync_config.http_retries, retry_delay=sync_config.http_retry_delay, - ) - - # Santiy check. We should have a group members list for each managed group. - if len(all_google_members) != len(all_google_groups): - raise RuntimeError( - 'Sanity check failed: groups in members map do not match group list') - - # Log some stats. - LOG.info('Total Google users: %s', len(all_google_uids)) - LOG.info( - 'Suspended Google users: %s', sum(1 if u['suspended'] else 0 for u in all_google_users)) - if not just_users: - LOG.info('Total Google groups: %s', len(all_google_gids)) - LOG.info( - 'Total Google group members: %s', sum([len(m) for g, m in all_google_members.items()]) - ) - - # -------------------------------------------------------------------------------------------- - # Compute differences between the Lookup and Google data. - # -------------------------------------------------------------------------------------------- - - # For each user which exists in Google or the managed user set which is eligible, determine if - # they need updating/creating. If so, record a patch/insert for the user. - LOG.info('Calculating updates...') - google_user_updates = {} - google_user_creations = {} - for uid, managed_user_entry in managed_user_entries_by_uid.items(): - # Heuristically determine the given and family names. - names = naming.get_names( - uid=uid, display_name=managed_user_entry.displayName, cn=managed_user_entry.cn, - sn=managed_user_entry.sn, given_name=managed_user_entry.givenName) - - # Form expected user resource fields. 
- expected_google_user = { - 'name': { - 'givenName': names.given_name, - 'familyName': names.family_name, - }, - } - - # Find existing Google user (if any). - existing_google_user = all_google_users_by_uid.get(uid) - - if existing_google_user is not None: - # See if we need to change the existing user - # Unless anything needs changing, the patch is empty. - patch = {} - - # Determine how to patch user's name. - google_user_name = existing_google_user.get('name', {}) - patch_name = {} - if google_user_name.get('givenName') != expected_google_user['name']['givenName']: - patch_name['givenName'] = names.given_name - if google_user_name.get('familyName') != expected_google_user['name']['familyName']: - patch_name['familyName'] = names.family_name - if len(patch_name) > 0: - patch['name'] = patch_name - - # Only record non-empty patches. - if len(patch) > 0: - google_user_updates[uid] = patch - else: - # No existing Google user. Record the new resource. Generate a new user password and - # send Google the hash. It doesn't matter what this password is since we never have the - # user log in with it. For password-only applications the user can make use of an - # application-specific password. - new_user = { - 'primaryEmail': uid_to_email(uid), - **expected_google_user, - } - google_user_creations[uid] = new_user - - # For each group which exists in Google or the managed group set which is eligible, determine - # if it needs updating/creating. If so, record a patch/insert for the group. - google_group_updates = {} - google_group_creations = {} - for gid, managed_group_entry in managed_group_entries_by_gid.items(): - # Form expected group resource fields. 
The 2 Google APIs we use here to update groups in - # Google each have different maximum lengths for group names and descriptions, and - # empirically the APIs don't function properly if either limit is exceeded, so we use the - # minimum of the 2 documented maximum field lengths (73 characters for names and 300 - # characters for descriptions). - # - # Note that the source of each of these groups may be either a Lookup group or a Lookup - # institution, which are handled the same here. Technically Lookup institutions do not have - # descriptions, but the code in ldap.py sets the description from the name for Lookup - # institutions, which is useful since some institution names do not fit in the Google name - # field. - expected_google_group = { - 'name': _trim_text( - managed_group_entry.groupName, maxlen=73, suffix=sync_config.group_name_suffix - ), - 'description': _trim_text( - _clean_group_desc(managed_group_entry.description), - maxlen=300 - ) - } - - # Find existing Google group (if any). - existing_google_group = all_google_groups_by_gid.get(gid) - - if existing_google_group is not None: - # See if we need to change the existing group - # Unless anything needs changing, the patch is empty. - patch = {} - - if existing_google_group.get('name') != expected_google_group['name']: - patch['name'] = expected_google_group['name'] - if existing_google_group.get('description') != expected_google_group['description']: - patch['description'] = expected_google_group['description'] - - # Only record non-empty patches. - if len(patch) > 0: - google_group_updates[gid] = patch - else: - # No existing Google group, so create one. - google_group_creations[gid] = { - 'email': gid_to_email(gid), - **expected_google_group - } - - # Form a set of all the uids which need patching. - uids_to_update = set(google_user_updates.keys()) - LOG.info('Number of existing users to update: %s', len(uids_to_update)) - - # Form a set of all the gids which need patching. 
- gids_to_update = set(google_group_updates.keys()) - LOG.info('Number of existing groups to update: %s', len(gids_to_update)) - - # Form a set of all the uids which need adding. - uids_to_add = set(google_user_creations.keys()) - LOG.info('Number of users to add: %s', len(uids_to_add)) - - # Form a set of all the gids which need adding. - gids_to_add = set(google_group_creations.keys()) - LOG.info('Number of groups to add: %s', len(gids_to_add)) - - # Form a set of all uids which need reactivating. We reactive users who are in the managed user - # list *and* the suspended user list. - uids_to_reactivate = suspended_google_uids & managed_user_uids - LOG.info('Number of users to reactivate: %s', len(uids_to_reactivate)) - - # Form a set of all uids which should be suspended. This is all the unsuspended Google uids - # which do not appear in our eligible user list. - uids_to_suspend = (all_google_uids - suspended_google_uids) - eligible_uids - LOG.info('Number of users to suspend: %s', len(uids_to_suspend)) - - # Form a set of all gids which need deleting. - gids_to_delete = all_google_gids - eligible_gids - LOG.info('Number of groups to delete: %s', len(gids_to_delete)) - - # For each managed group, determine which members to insert or delete. These are lists of - # (gid, uid) tuples. - members_to_insert = [] - members_to_delete = [] - for gid, managed_group_entry in managed_group_entries_by_gid.items(): - # Find the existing Google group members. - existing_google_group = all_google_groups_by_gid.get(gid) - if existing_google_group: - existing_members = all_google_members[existing_google_group['id']] - existing_member_uids = set([email_to_uid(m['email']) for m in existing_members]) - else: - existing_member_uids = set() - - # Members to insert. This is restricted to the managed user set, so that we don't attempt - # to insert a member resource for a non-existent user. 
- insert_uids = ( - (managed_group_entry.uids - existing_member_uids).intersection(managed_user_uids) - ) - members_to_insert.extend([(gid, uid) for uid in insert_uids]) - - # Members to delete. This is restricted to the eligible user set, so that we don't bother - # to delete a member resource when the user is suspended (and so we won't need to re-add - # it if the user is reactivated). - delete_uids = ( - (existing_member_uids - managed_group_entry.uids).intersection(eligible_uids) - ) - members_to_delete.extend([(gid, uid) for uid in delete_uids]) - - LOG.info('Number of group members to insert: %s', len(members_to_insert)) - LOG.info('Number of group members to delete: %s', len(members_to_delete)) - - # -------------------------------------------------------------------------------------------- - # Enforce limits on how much data to change in Google. - # -------------------------------------------------------------------------------------------- - - # Calculate percentage change to users, groups and group members. - user_change_percentage = 100. * ( - len(uids_to_add | uids_to_update | uids_to_reactivate | uids_to_suspend) - / - max(1, len(all_google_uids)) - ) - LOG.info('Configuration will modify %.2f%% of users', user_change_percentage) - - group_change_percentage = 100. * ( - len(gids_to_add | gids_to_update | gids_to_delete) - / - max(1, len(all_google_gids)) - ) - LOG.info('Configuration will modify %.2f%% of groups', group_change_percentage) - - member_change_percentage = 100. * ( - (len(members_to_insert) + len(members_to_delete)) - / - max(1, sum([len(m) for g, m in all_google_members.items()])) - ) - LOG.info('Configuration will modify %.2f%% of group members', member_change_percentage) - - # Enforce percentage change sanity checks. - if (limits_config.abort_user_change_percentage is not None and - user_change_percentage > limits_config.abort_user_change_percentage): - LOG.error( - 'Modification of %.2f%% of users is greater than limit of %.2f%%. 
Aborting.', - user_change_percentage, limits_config.abort_user_change_percentage - ) - raise RuntimeError('Aborting due to large user change percentage') - if (limits_config.abort_group_change_percentage is not None and - group_change_percentage > limits_config.abort_group_change_percentage): - LOG.error( - 'Modification of %.2f%% of groups is greater than limit of %.2f%%. Aborting.', - group_change_percentage, limits_config.abort_group_change_percentage - ) - raise RuntimeError('Aborting due to large group change percentage') - if (limits_config.abort_member_change_percentage is not None and - member_change_percentage > limits_config.abort_member_change_percentage): - LOG.error( - 'Modification of %.2f%% of group members is greater than limit of %.2f%%. Aborting.', - member_change_percentage, limits_config.abort_member_change_percentage - ) - raise RuntimeError('Aborting due to large group member change percentage') - - # Cap maximum size of various operations. - if limits_config.max_new_users is not None and len(uids_to_add) > limits_config.max_new_users: - # Ensure that we do not attempt to insert a group member for any of the users not added as - # a result of this cap, since these users won't exist in Google - capped_uids_to_add = _limit(uids_to_add, limits_config.max_new_users) - uids_not_added = uids_to_add - capped_uids_to_add - members_to_insert = [(g, u) for g, u in members_to_insert if u not in uids_not_added] - uids_to_add = capped_uids_to_add - LOG.info('Capped number of new users to %s', len(uids_to_add)) - if (limits_config.max_new_groups is not None and - len(gids_to_add) > limits_config.max_new_groups): - # Ensure that we do not attempt to insert a group member for any of the groups not added - # as a result of this cap, since these groups won't exist in Google - capped_gids_to_add = _limit(gids_to_add, limits_config.max_new_groups) - gids_not_added = gids_to_add - capped_gids_to_add - members_to_insert = [(g, u) for g, u in members_to_insert if g 
not in gids_not_added] - gids_to_add = capped_gids_to_add - LOG.info('Capped number of new groups to %s', len(gids_to_add)) - if (limits_config.max_suspended_users is not None and - len(uids_to_suspend) > limits_config.max_suspended_users): - uids_to_suspend = _limit(uids_to_suspend, limits_config.max_suspended_users) - LOG.info('Capped number of users to suspend to %s', len(uids_to_suspend)) - if (limits_config.max_deleted_groups is not None and - len(gids_to_delete) > limits_config.max_deleted_groups): - gids_to_delete = _limit(gids_to_delete, limits_config.max_deleted_groups) - LOG.info('Capped number of groups to delete to %s', len(gids_to_delete)) - if (limits_config.max_reactivated_users is not None and - len(uids_to_reactivate) > limits_config.max_reactivated_users): - uids_to_reactivate = _limit(uids_to_reactivate, limits_config.max_reactivated_users) - LOG.info('Capped number of users to reactivate to %s', len(uids_to_reactivate)) - if (limits_config.max_updated_users is not None and - len(uids_to_update) > limits_config.max_updated_users): - uids_to_update = _limit(uids_to_update, limits_config.max_updated_users) - LOG.info('Capped number of users to update to %s', len(uids_to_update)) - if (limits_config.max_updated_groups is not None and - len(gids_to_update) > limits_config.max_updated_groups): - gids_to_update = _limit(gids_to_update, limits_config.max_updated_groups) - LOG.info('Capped number of groups to update to %s', len(gids_to_update)) - if (limits_config.max_inserted_members is not None and - len(members_to_insert) > limits_config.max_inserted_members): - members_to_insert = members_to_insert[0:limits_config.max_inserted_members] - LOG.info('Capped number of group members to insert to %s', len(members_to_insert)) - if (limits_config.max_deleted_members is not None and - len(members_to_delete) > limits_config.max_deleted_members): - members_to_delete = members_to_delete[0:limits_config.max_deleted_members] - LOG.info('Capped number of group 
members to delete to %s', len(members_to_delete)) - - # -------------------------------------------------------------------------------------------- - # Finally, perform the actual updates in Google. - # -------------------------------------------------------------------------------------------- - - # A generator which will generate patch() and insert() calls to the directory service to - # perform the actions required to update users - def user_api_requests(): - # Update existing users. - user_updates = {uid: google_user_updates[uid] for uid in uids_to_update} - for uid, update in user_updates.items(): - google_id = all_google_users_by_uid[uid]['id'] - # Only show the previous parts of name that have been changed - updated_google_user_name = update.get('name', {}) - previous_google_user_name = all_google_users_by_uid[uid].get('name', {}) - previous = { - k: previous_google_user_name.get(k, '') - for k in ['givenName', 'familyName'] - if k in updated_google_user_name - } - LOG.info('Update user "%s": "%r" from "%r"', uid, update, previous) - yield directory_service.users().patch(userKey=google_id, body=update) - - # Suspend old users - for uid in uids_to_suspend: - google_id = all_google_users_by_uid[uid]['id'] - LOG.info('Suspending user: "%s"', uid) - yield directory_service.users().patch(userKey=google_id, body={'suspended': True}) - - # Reactivate returning users - for uid in uids_to_reactivate: - google_id = all_google_users_by_uid[uid]['id'] - LOG.info('Reactivating user: "%s"', uid) - yield directory_service.users().patch(userKey=google_id, body={'suspended': False}) - - # Create new users - for uid in uids_to_add: - # Generate a random password which is thrown away. 
- new_user = {**{ - 'hashFunction': 'crypt', - 'password': crypt.crypt(secrets.token_urlsafe(), crypt.METHOD_SHA512), - 'orgUnitPath': sync_config.new_user_org_unit_path, - }, **google_user_creations[uid]} - redacted_user = {**new_user, **{'password': 'REDACTED'}} - LOG.info('Adding user "%s": %s', uid, redacted_user) - yield directory_service.users().insert(body=new_user) - - # A generator which will generate patch(), insert() and delete() calls to the directory - # service to perform the actions required to update groups - def group_api_requests(): - # Update existing groups - group_updates = {gid: google_group_updates[gid] for gid in gids_to_update} - for gid, update in group_updates.items(): - google_id = all_google_groups_by_gid[gid]['id'] - LOG.info('Update group "%s": "%r"', gid, update) - yield directory_service.groups().patch(groupKey=google_id, body=update) - - # Delete cancelled groups - for gid in gids_to_delete: - google_id = all_google_groups_by_gid[gid]['id'] - LOG.info('Deleting group: "%s"', gid) - yield directory_service.groups().delete(groupKey=google_id) - - # Create new groups - for gid in gids_to_add: - new_group = google_group_creations[gid] - LOG.info('Adding group "%s": %s', gid, new_group) - yield directory_service.groups().insert(body=new_group) - - # A generator which will generate patch() calls to the groupssettings service to set or - # update the required group settings. - def group_settings_api_requests(): - # Apply all settings to new groups. - for gid in gids_to_add: - email = gid_to_email(gid) - settings = sync_config.group_settings - LOG.info('Updating settings for new group "%s": %s', gid, settings) - yield groupssettings_service.groups().patch(groupUniqueId=email, body=settings) - - if group_settings: - # Apply any settings that differ to pre-existing groups. 
- for gid, settings in all_google_group_settings_by_gid.items(): - patch = {k: v for k, v in sync_config.group_settings.items() - if settings.get(k) != v} - if patch: - email = gid_to_email(gid) - LOG.info('Updating settings for existing group "%s": %s', gid, patch) - yield groupssettings_service.groups().patch(groupUniqueId=email, body=patch) - else: - LOG.info('Skipping updating settings for existing groups') - - # A generator which will generate insert() and delete() calls to the directory service to - # perform the actions required to update group members - def member_api_requests(): - # Insert new members - for gid, uid in members_to_insert: - group_key = gid_to_email(gid) - user_key = uid_to_email(uid) - LOG.info('Adding user "%s" to group "%s"', user_key, group_key) - yield directory_service.members().insert(groupKey=group_key, body={'email': user_key}) - - # Delete removed members - for gid, uid in members_to_delete: - group_key = gid_to_email(gid) - user_key = uid_to_email(uid) - LOG.info('Removing user "%s" from group "%s"', user_key, group_key) - yield directory_service.members().delete(groupKey=group_key, memberKey=user_key) - - # Process an iterable list of requests to the specified Google service in batches. These APIs - # support a maximum batch size of 1000. See: - # https://developers.google.com/admin-sdk/directory/v1/guides/batch - def process_requests(service, requests): - for request_batch in _grouper(requests, n=sync_config.batch_size): - # Form batch request. - batch = service.new_batch_http_request() - for request in request_batch: - batch.add(request, callback=_handle_batch_response) - - # Execute the batch request if not in read only mode. Otherwise log that we would - # have. 
- if not read_only: - LOG.info('Issuing batch request to Google.') - time.sleep(sync_config.inter_batch_delay) - retries = sync_config.http_retries - while True: - try: - batch.execute() - except errors.HttpError as err: - if (err.resp.status == 503 and retries > 0): - retries -= 1 - LOG.warn('503: Service unavailable - retrying') - time.sleep(sync_config.http_retry_delay) - continue - if retries == 0: - LOG.error('503: Service unavailable - retry count exceeded') - raise - break - else: - LOG.info('Not issuing batch request in read-only mode.') - - # Process all the user, group and group member updates - process_requests(directory_service, user_api_requests()) - if not just_users: - process_requests(directory_service, group_api_requests()) - process_requests(groupssettings_service, group_settings_api_requests()) - process_requests(directory_service, member_api_requests()) - - -def _handle_batch_response(request_id, response, exception): - if exception is not None: - LOG.error('Error performing request: %s', exception) - LOG.error('Response: %r', response) - - -def _limit(s, limit): - """ - Given a set, s, and a numeric limit, return a set which has no more than *limit* elements. The - exact set of elements retained is not specified. - - >>> s = set('ABCDEFGHIJKLMNOPQ') - >>> len(s) > 5 - True - >>> len(_limit(s, 5)) == 5 - True - >>> len(_limit(s, 500)) == len(s) - True - - All elements of the returned set are taken from input set. - - >>> s_prime = _limit(s, 5) - >>> s_prime - s - set() - - """ - return {e for _, e in itertools.takewhile(lambda p: p[0] < limit, enumerate(s))} - - -def _grouper(iterable, *, n): - """ - Group an iterable into chunks of at most *n* elements. A generator which yields iterables - representing slices of *iterable*. - - >>> [list(i) for i in _grouper('ABCDEFGH', n=3)] - [['A', 'B', 'C'], ['D', 'E', 'F'], ['G', 'H']] - >>> def generator(stop): - ... for x in range(stop): - ... 
yield x - >>> [list(i) for i in _grouper(generator(10), n=3)] - [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] - >>> [list(i) for i in _grouper(generator(12), n=3)] - [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]] - - The implementation of this function attempts to be efficient; the chunks are iterables which - are generated on demand rather than being constructed first. Hence this function can deal with - iterables which would fill memory if intermediate chunks were stored. - - >>> i = _grouper(generator(100000000000000000000), n=1000000000000000) - >>> next(next(i)) - 0 - - """ - it = iter(iterable) - while True: - next_chunk_it = itertools.islice(it, n) - try: - first = next(next_chunk_it) - except StopIteration: - return - yield itertools.chain((first,), next_chunk_it) - - -def _trim_text(text, *, maxlen, cont='...', suffix=''): - """ - Trim text to be no more than "maxlen" characters long, terminating it with "cont" if it had - to be truncated. If supplied, "suffix" is appended to the string after truncating, and the - truncation point adjusted so that the total length remains less than "maxlen". - - """ - return ( - text[0:maxlen-len(cont)-len(suffix)]+cont+suffix - if len(text)+len(suffix) > maxlen else text+suffix - ) - - -def _clean_group_desc(s): - """ - Clean any "bad characters" in group descriptions. - - Google support (https://support.google.com/a/answer/9193374) says: - "descriptions can’t contain equal signs (=), or brackets (<,>)" - - >>> _clean_group_desc('a<b>c=d') - 'abcd' - - """ - return ''.join(c for c in s if c not in _CLEAN_GROUP_DESC_BAD_CHARS) - - -# Characters stripped by _clean_group_desc. Present as a constant to avoid re-creating it. 
-_CLEAN_GROUP_DESC_BAD_CHARS = '=<>' diff --git a/gsuitesync/sync/__init__.py b/gsuitesync/sync/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e702854c2df0229d0353d13e7314fb60e9329a15 --- /dev/null +++ b/gsuitesync/sync/__init__.py @@ -0,0 +1 @@ +from .main import sync # noqa: F401 diff --git a/gsuitesync/sync/base.py b/gsuitesync/sync/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ba6cd664157f56aabf0a2df924339a590611315f --- /dev/null +++ b/gsuitesync/sync/base.py @@ -0,0 +1,14 @@ +""" +Base classes for retrievers, comparator and updater classes that consume configuration and state. + +""" + + +class ConfigurationStateConsumer: + required_config = None + + def __init__(self, configuration, state): + # For convenience, create properties for required configuration + for c in (self.required_config if self.required_config is not None else []): + setattr(self, f'{c}_config', configuration.get(c, {})) + self.state = state diff --git a/gsuitesync/sync/compare.py b/gsuitesync/sync/compare.py new file mode 100644 index 0000000000000000000000000000000000000000..02f62a533bc93c71b2bc7c4239fc687455bf8c1d --- /dev/null +++ b/gsuitesync/sync/compare.py @@ -0,0 +1,417 @@ +""" +Compute differences between the Lookup and Google data. + +""" +import logging +import itertools + +from .. import naming +from .base import ConfigurationStateConsumer +from .utils import uid_to_email, gid_to_email, email_to_uid + +LOG = logging.getLogger(__name__) + + +class Comparator(ConfigurationStateConsumer): + required_config = ('gapi_domain', 'sync', 'limits') + + def compare_users(self): + # For each user which exists in Google or the managed user set which is eligible, + # determine if they need updating/creating. If so, record a patch/insert for the user. 
+ LOG.info('Calculating updates to users...') + google_user_updates = {} + google_user_creations = {} + for uid, managed_user_entry in self.state.managed_user_entries_by_uid.items(): + # Heuristically determine the given and family names. + names = naming.get_names( + uid=uid, display_name=managed_user_entry.displayName, cn=managed_user_entry.cn, + sn=managed_user_entry.sn, given_name=managed_user_entry.givenName) + + # Form expected user resource fields. + expected_google_user = { + 'name': { + 'givenName': names.given_name, + 'familyName': names.family_name, + }, + } + + # Find existing Google user (if any). + existing_google_user = self.state.all_google_users_by_uid.get(uid) + + if existing_google_user is not None: + # See if we need to change the existing user + # Unless anything needs changing, the patch is empty. + patch = {} + + # Determine how to patch user's name. + google_name = existing_google_user.get('name', {}) + patch_name = {} + if google_name.get('givenName') != expected_google_user['name']['givenName']: + patch_name['givenName'] = names.given_name + if google_name.get('familyName') != expected_google_user['name']['familyName']: + patch_name['familyName'] = names.family_name + if len(patch_name) > 0: + patch['name'] = patch_name + + # Only record non-empty patches. + if len(patch) > 0: + google_user_updates[uid] = patch + else: + # No existing Google user. Record the new resource. Generate a new user password + # and send Google the hash. It doesn't matter what this password is since we never + # have the user log in with it. For password-only applications the user can make + # use of an application-specific password. + new_user = { + 'primaryEmail': uid_to_email(uid, self.gapi_domain_config.name), + **expected_google_user, + } + google_user_creations[uid] = new_user + + # Form a set of all the uids which need patching. 
+ uids_to_update = set(google_user_updates.keys())
+ LOG.info('Number of existing users to update: %s', len(uids_to_update))
+
+ # Form a set of all the uids which need adding.
+ uids_to_add = set(google_user_creations.keys())
+ LOG.info('Number of users to add: %s', len(uids_to_add))
+
+ # Form a set of all uids which need reactivating. We reactivate users who are in the managed
+ # user list *and* the suspended user list.
+ uids_to_reactivate = self.state.suspended_google_uids & self.state.managed_user_uids
+ LOG.info('Number of users to reactivate: %s', len(uids_to_reactivate))
+
+ # Form a set of all uids which should be suspended. This is all the unsuspended Google uids
+ # which do not appear in our eligible user list.
+ uids_to_suspend = (
+ (self.state.all_google_uids - self.state.suspended_google_uids)
+ - self.state.eligible_uids
+ )
+ LOG.info('Number of users to suspend: %s', len(uids_to_suspend))
+
+ self.state.update({
+ 'google_user_updates': google_user_updates,
+ 'google_user_creations': google_user_creations,
+ 'uids_to_update': uids_to_update,
+ 'uids_to_add': uids_to_add,
+ 'uids_to_reactivate': uids_to_reactivate,
+ 'uids_to_suspend': uids_to_suspend,
+ })
+
+ def compare_groups(self):
+ # For each group which exists in Google or the managed group set which is eligible,
+ # determine if it needs updating/creating. If so, record a patch/insert for the group.
+ LOG.info('Calculating updates to groups...')
+ google_group_updates = {}
+ google_group_creations = {}
+ for gid, managed_group_entry in self.state.managed_group_entries_by_gid.items():
+ # Form expected group resource fields. The 2 Google APIs we use here to update groups
+ # in Google each have different maximum lengths for group names and descriptions, and
+ # empirically the APIs don't function properly if either limit is exceeded, so we use
+ # the minimum of the 2 documented maximum field lengths (73 characters for names and
+ # 300 characters for descriptions). 
+ # + # Note that the source of each of these groups may be either a Lookup group or a Lookup + # institution, which are handled the same here. Technically Lookup institutions do not + # have descriptions, but the code in ldap.py sets the description from the name for + # Lookup institutions, which is useful since some institution names do not fit in the + # Google name field. + expected_google_group = { + 'name': _trim_text( + managed_group_entry.groupName, maxlen=73, + suffix=self.sync_config.group_name_suffix + ), + 'description': _trim_text( + _clean_group_desc(managed_group_entry.description), + maxlen=300 + ) + } + + # Find existing Google group (if any). + existing_google_group = self.state.all_google_groups_by_gid.get(gid) + + if existing_google_group is not None: + # See if we need to change the existing group + # Unless anything needs changing, the patch is empty. + patch = {} + + if existing_google_group.get('name') != expected_google_group['name']: + patch['name'] = expected_google_group['name'] + if (existing_google_group.get('description') != + expected_google_group['description']): + patch['description'] = expected_google_group['description'] + + # Only record non-empty patches. + if len(patch) > 0: + google_group_updates[gid] = patch + else: + # No existing Google group, so create one. + google_group_creations[gid] = { + 'email': gid_to_email(gid, self.state.groups_domain, self.state.insts_domain), + **expected_google_group + } + + # Form a set of all the gids which need patching. + gids_to_update = set(google_group_updates.keys()) + LOG.info('Number of existing groups to update: %s', len(gids_to_update)) + + # Form a set of all the gids which need adding. + gids_to_add = set(google_group_creations.keys()) + LOG.info('Number of groups to add: %s', len(gids_to_add)) + + # Form a set of all gids which need deleting. 
+ gids_to_delete = self.state.all_google_gids - self.state.eligible_gids + LOG.info('Number of groups to delete: %s', len(gids_to_delete)) + + # For each managed group, determine which members to insert or delete. These are lists of + # (gid, uid) tuples. + members_to_insert = [] + members_to_delete = [] + for gid, managed_group_entry in self.state.managed_group_entries_by_gid.items(): + # Find the existing Google group members. + existing_google_group = self.state.all_google_groups_by_gid.get(gid) + if existing_google_group: + existing_members = self.state.all_google_members[existing_google_group['id']] + existing_member_uids = set([email_to_uid(m['email']) for m in existing_members]) + else: + existing_member_uids = set() + + # Members to insert. This is restricted to the managed user set, so that we don't + # attempt to insert a member resource for a non-existent user. + insert_uids = ( + (managed_group_entry.uids - existing_member_uids) + .intersection(self.state.managed_user_uids) + ) + members_to_insert.extend([(gid, uid) for uid in insert_uids]) + + # Members to delete. This is restricted to the eligible user set, so that we don't + # bother to delete a member resource when the user is suspended (and so we won't need + # to re-add it if the user is reactivated). 
+ delete_uids = ( + (existing_member_uids - managed_group_entry.uids) + .intersection(self.state.eligible_uids) + ) + members_to_delete.extend([(gid, uid) for uid in delete_uids]) + + LOG.info('Number of group members to insert: %s', len(members_to_insert)) + LOG.info('Number of group members to delete: %s', len(members_to_delete)) + + self.state.update({ + 'google_group_updates': google_group_updates, + 'google_group_creations': google_group_creations, + 'gids_to_update': gids_to_update, + 'gids_to_add': gids_to_add, + 'gids_to_delete': gids_to_delete, + 'members_to_insert': members_to_insert, + 'members_to_delete': members_to_delete, + }) + + def compare_groups_settings(self): + # Determine changes to existing group settings + group_settings_to_update = {} + for gid, settings in self.state.all_google_group_settings_by_gid.items(): + patch = { + k: v for k, v in self.sync_config.group_settings.items() + if settings.get(k) != v + } + if len(patch) > 0: + group_settings_to_update[gid] = patch + + gids_to_update_group_settings = set(group_settings_to_update.keys()) + LOG.info('Number of existing groups to update settings: %s', + len(gids_to_update_group_settings)) + + self.state.update({ + 'group_settings_to_update': group_settings_to_update, + 'gids_to_update_group_settings': gids_to_update_group_settings, + }) + + def enforce_limits(self, just_users): + # -------------------------------------------------------------------------------------------- + # Enforce limits on how much data to change in Google. + # -------------------------------------------------------------------------------------------- + + # Calculate percentage change to users, groups and group members. + user_change_percentage = 100. 
* ( + len(self.state.uids_to_add | self.state.uids_to_update | + self.state.uids_to_reactivate | self.state.uids_to_suspend) + / + max(1, len(self.state.all_google_uids)) + ) + LOG.info('Configuration will modify %.2f%% of users', user_change_percentage) + + if not just_users: + group_change_percentage = 100. * ( + len(self.state.gids_to_add | self.state.gids_to_update | self.state.gids_to_delete) + / + max(1, len(self.state.all_google_gids)) + ) + LOG.info('Configuration will modify %.2f%% of groups', group_change_percentage) + + member_change_percentage = 100. * ( + (len(self.state.members_to_insert) + len(self.state.members_to_delete)) + / + max(1, sum([len(m) for g, m in self.state.all_google_members.items()])) + ) + LOG.info('Configuration will modify %.2f%% of group members', member_change_percentage) + + # Enforce percentage change sanity checks. + if (self.limits_config.abort_user_change_percentage is not None and + user_change_percentage > self.limits_config.abort_user_change_percentage): + LOG.error( + 'Modification of %.2f%% of users is greater than limit of %.2f%%. Aborting.', + user_change_percentage, self.limits_config.abort_user_change_percentage + ) + raise RuntimeError('Aborting due to large user change percentage') + + if not just_users: + if (self.limits_config.abort_group_change_percentage is not None and + group_change_percentage > self.limits_config.abort_group_change_percentage): + LOG.error( + 'Modification of %.2f%% of groups is greater than limit of %.2f%%. Aborting.', + group_change_percentage, self.limits_config.abort_group_change_percentage + ) + raise RuntimeError('Aborting due to large group change percentage') + if (self.limits_config.abort_member_change_percentage is not None and + member_change_percentage > self.limits_config.abort_member_change_percentage): + LOG.error( + 'Modification of %.2f%% of group members is greater than limit of %.2f%%. 
' + 'Aborting.', + member_change_percentage, self.limits_config.abort_member_change_percentage + ) + raise RuntimeError('Aborting due to large group member change percentage') + + # Cap maximum size of various operations. + if (self.limits_config.max_new_users is not None + and len(self.state.uids_to_add) > self.limits_config.max_new_users): + # Ensure that we do not attempt to insert a group member for any of the users not + # added as a result of this cap, since these users won't exist in Google + capped_uids_to_add = _limit(self.state.uids_to_add, self.limits_config.max_new_users) + uids_not_added = self.state.uids_to_add - capped_uids_to_add + if not just_users: + self.state.members_to_insert = [ + (g, u) for g, u in self.state.members_to_insert if u not in uids_not_added + ] + self.state.uids_to_add = capped_uids_to_add + LOG.info('Capped number of new users to %s', len(self.state.uids_to_add)) + + if (self.limits_config.max_suspended_users is not None and + len(self.state.uids_to_suspend) > self.limits_config.max_suspended_users): + self.state.uids_to_suspend = _limit( + self.state.uids_to_suspend, self.limits_config.max_suspended_users + ) + LOG.info('Capped number of users to suspend to %s', len(self.state.uids_to_suspend)) + if (self.limits_config.max_reactivated_users is not None and + len(self.state.uids_to_reactivate) > self.limits_config.max_reactivated_users): + self.state.uids_to_reactivate = _limit( + self.state.uids_to_reactivate, self.limits_config.max_reactivated_users + ) + LOG.info( + 'Capped number of users to reactivate to %s', + len(self.state.uids_to_reactivate) + ) + if (self.limits_config.max_updated_users is not None and + len(self.state.uids_to_update) > self.limits_config.max_updated_users): + self.state.uids_to_update = _limit( + self.state.uids_to_update, self.limits_config.max_updated_users + ) + LOG.info('Capped number of users to update to %s', len(self.state.uids_to_update)) + + if not just_users: + if 
(self.limits_config.max_new_groups is not None and + len(self.state.gids_to_add) > self.limits_config.max_new_groups): + # Ensure that we do not attempt to insert a group member for any of the groups not + # added as a result of this cap, since these groups won't exist in Google + capped_gids_to_add = _limit( + self.state.gids_to_add, self.limits_config.max_new_groups + ) + gids_not_added = self.state.gids_to_add - capped_gids_to_add + self.state.members_to_insert = [ + (g, u) for g, u in self.state.members_to_insert if g not in gids_not_added + ] + self.state.gids_to_add = capped_gids_to_add + LOG.info('Capped number of new groups to %s', len(self.state.gids_to_add)) + + if (self.limits_config.max_deleted_groups is not None and + len(self.state.gids_to_delete) > self.limits_config.max_deleted_groups): + self.state.gids_to_delete = _limit( + self.state.gids_to_delete, self.limits_config.max_deleted_groups + ) + LOG.info('Capped number of groups to delete to %s', len(self.state.gids_to_delete)) + if (self.limits_config.max_updated_groups is not None and + len(self.state.gids_to_update) > self.limits_config.max_updated_groups): + self.state.gids_to_update = _limit( + self.state.gids_to_update, self.limits_config.max_updated_groups + ) + LOG.info('Capped number of groups to update to %s', len(self.state.gids_to_update)) + if (self.limits_config.max_inserted_members is not None and + len(self.state.members_to_insert) > self.limits_config.max_inserted_members): + self.state.members_to_insert = ( + self.state.members_to_insert[0:self.limits_config.max_inserted_members] + ) + LOG.info( + 'Capped number of group members to insert to %s', + len(self.state.members_to_insert) + ) + if (self.limits_config.max_deleted_members is not None and + len(self.state.members_to_delete) > self.limits_config.max_deleted_members): + self.state.members_to_delete = ( + self.state.members_to_delete[0:self.limits_config.max_deleted_members] + ) + LOG.info( + 'Capped number of group members to 
delete to %s', + len(self.state.members_to_delete) + ) + + +def _limit(s, limit): + """ + Given a set, s, and a numeric limit, return a set which has no more than *limit* elements. The + exact set of elements retained is not specified. + + >>> s = set('ABCDEFGHIJKLMNOPQ') + >>> len(s) > 5 + True + >>> len(_limit(s, 5)) == 5 + True + >>> len(_limit(s, 500)) == len(s) + True + + All elements of the returned set are taken from input set. + + >>> s_prime = _limit(s, 5) + >>> s_prime - s + set() + + """ + return {e for _, e in itertools.takewhile(lambda p: p[0] < limit, enumerate(s))} + + +def _trim_text(text, *, maxlen, cont='...', suffix=''): + """ + Trim text to be no more than "maxlen" characters long, terminating it with "cont" if it had + to be truncated. If supplied, "suffix" is appended to the string after truncating, and the + truncation point adjusted so that the total length remains less than "maxlen". + + """ + return ( + text[0:maxlen-len(cont)-len(suffix)]+cont+suffix + if len(text)+len(suffix) > maxlen else text+suffix + ) + + +def _clean_group_desc(s): + """ + Clean any "bad characters" in group descriptions. + + Google support (https://support.google.com/a/answer/9193374) says: + "descriptions can’t contain equal signs (=), or brackets (<,>)" + + >>> _clean_group_desc('a<b>c=d') + 'abcd' + + """ + return ''.join(c for c in s if c not in _CLEAN_GROUP_DESC_BAD_CHARS) + + +# Characters stripped by _clean_group_desc. Present as a constant to avoid re-creating it. +_CLEAN_GROUP_DESC_BAD_CHARS = '=<>' diff --git a/gsuitesync/sync/gapi.py b/gsuitesync/sync/gapi.py new file mode 100644 index 0000000000000000000000000000000000000000..65c9f137ee243d623db17cbb1e7e26bd4a27ac02 --- /dev/null +++ b/gsuitesync/sync/gapi.py @@ -0,0 +1,267 @@ +""" +Load current user, group and institution data from Google. 
+ +""" +import logging +import re + +from google.oauth2 import service_account +from googleapiclient import discovery + +from .base import ConfigurationStateConsumer +from .. import gapiutil +from .utils import email_to_uid, email_to_gid, groupID_regex, instID_regex + + +LOG = logging.getLogger(__name__) + +# Scopes required to perform read-only actions. +READ_ONLY_SCOPES = [ + 'https://www.googleapis.com/auth/admin.directory.user.readonly', + 'https://www.googleapis.com/auth/admin.directory.group.readonly', + 'https://www.googleapis.com/auth/admin.directory.group.member.readonly', + 'https://www.googleapis.com/auth/apps.groups.settings' +] + +# Scopes *in addition to READ_ONLY_SCOPES* required to perform a full update. +WRITE_SCOPES = [ + 'https://www.googleapis.com/auth/admin.directory.user', + 'https://www.googleapis.com/auth/admin.directory.group', + 'https://www.googleapis.com/auth/admin.directory.group.member' +] + + +class GAPIRetriever(ConfigurationStateConsumer): + required_config = ('gapi_auth', 'gapi_domain', 'sync') + + def connect(self, read_only=True): + # load credentials + self.creds = self._get_credentials(read_only) + # Build the directory service using Google API discovery. 
+ directory_service = discovery.build('admin', 'directory_v1', credentials=self.creds) + + # Secondary domain for Google groups that come from Lookup groups + groups_domain = ( + self.gapi_domain_config.groups_domain + if self.gapi_domain_config.groups_domain is not None + else self.gapi_domain_config.name + ) + # Secondary domain for Google groups that come from Lookup institutions + insts_domain = ( + self.gapi_domain_config.insts_domain + if self.gapi_domain_config.insts_domain is not None + else self.gapi_domain_config.name + ) + + # Return components needed for connection with Google API + self.state.update({ + 'directory_service': directory_service, + 'groups_domain': groups_domain, + 'insts_domain': insts_domain, + }) + + def _get_credentials(self, read_only): + """ + Create a Google credentials object from the configuration. Use *read_only* to indicate if + read-only credentials are preferred. + + """ + # Load appropriate Google credentials. + creds_file = self.gapi_auth_config.credentials + if read_only and self.gapi_auth_config.read_only_credentials is not None: + creds = self.gapi_auth_config.read_only_credentials + LOG.info('Using read-only credentials.') + + LOG.info('Loading Google account credentials from "%s"', creds_file) + creds = service_account.Credentials.from_service_account_file(creds_file) + + # With scopes based on read_only + creds = creds.with_scopes(READ_ONLY_SCOPES + ([] if read_only else WRITE_SCOPES)) + + # Use admin_user if using service account with Domain-Wide Delegation + if self.gapi_domain_config.admin_user: + creds = creds.with_subject(self.gapi_domain_config.admin_user) + + return creds + + def retrieve_users(self): + # Retrieve information on all users excluding domain admins. 
+ LOG.info('Getting information on Google domain users') + fields = [ + 'id', 'isAdmin', 'orgUnitPath', 'primaryEmail', 'suspended', 'suspensionReason', + 'name(givenName, familyName)', + ] + all_google_users = gapiutil.list_all( + self.state.directory_service.users().list, items_key='users', + domain=self.gapi_domain_config.name, + query='isAdmin=false', fields='nextPageToken,users(' + ','.join(fields) + ')', + retries=self.sync_config.http_retries, retry_delay=self.sync_config.http_retry_delay, + ) + # Strip any "to be ignored" users out of the results. + if self.sync_config.ignore_google_org_unit_path_regex is not None: + LOG.info( + 'Ignoring users whose organization unit path matches %r', + self.sync_config.ignore_google_org_unit_path_regex) + # Check that all users have an orgUnitPath + missing_org = [ + u for u in all_google_users if 'orgUnitPath' not in u + ] + if len(missing_org) != 0: + LOG.error( + 'User entries missing orgUnitPath: %s (starting with %s)', len(missing_org), + missing_org[0]['primaryEmail'] if 'primaryEmail' in missing_org[0] + else 'user with blank email' + ) + raise RuntimeError('Sanity check failed: at least one user is missing orgUnitPath') + # Remove users matching regex + regex = re.compile(self.sync_config.ignore_google_org_unit_path_regex) + all_google_users = [ + u for u in all_google_users if not regex.match(u['orgUnitPath']) + ] + + # Strip out any users with uids (extracted from the local-part of the email address) that + # aren't valid CRSids. These users can't have come from Lookup, and so should not be + # managed (suspended) by this script. + all_google_users = [ + u for u in all_google_users if email_to_uid(u['primaryEmail']) + ] + + # Sanity check. There should be no admins in the returned results. + if any(u.get('isAdmin', False) for u in all_google_users): + raise RuntimeError('Sanity check failed: admin users in user list') + + # Form mappings from uid to Google user. 
+ all_google_users_by_uid = { + email_to_uid(u['primaryEmail']): u for u in all_google_users + } + + # Form sets of all Google-side uids. The all_google_uids set is all users including + # the suspended ones and the suspended_google_uids set is only the suspended users. Non + # suspended users are therefore all_google_uids - suspended_google_uids. + all_google_uids = set(all_google_users_by_uid.keys()) + suspended_google_uids = { + uid for uid, u in all_google_users_by_uid.items() if u['suspended'] + } + + # Sanity check. We should not have lost anything. (I.e. the uids should be unique.) + if len(all_google_uids) != len(all_google_users): + raise RuntimeError('Sanity check failed: user list changed length') + + # Log some stats. + LOG.info('Total Google users: %s', len(all_google_uids)) + LOG.info( + 'Suspended Google users: %s', + sum(1 if u['suspended'] else 0 for u in all_google_users) + ) + + self.state.update({ + 'all_google_users': all_google_users, + 'all_google_users_by_uid': all_google_users_by_uid, + 'all_google_uids': all_google_uids, + 'suspended_google_uids': suspended_google_uids, + }) + + def retrieve_groups(self): + # Retrieve information on all Google groups that come from Lookup groups + LOG.info('Getting information on Google domain groups') + all_google_groups = [ + g for g in self._fetch_groups(self.state.groups_domain) + if groupID_regex.match(g['email'].split('@')[0]) + ] + + # Append information on all Google groups that come from Lookup institutions + LOG.info('Getting information on Google domain institutions') + all_google_groups.extend([ + g for g in self._fetch_groups(self.state.insts_domain) + if instID_regex.match(g['email'].split('@')[0].upper()) + ]) + + # Strip out any groups whose email addresses don't match the pattern for groups created + # from Lookup groupIDs or instIDs, and which therefore should not be managed (deleted) by + # this script. 
+ all_google_groups = [g for g in all_google_groups if email_to_gid(g['email'])] + + # Form mappings from gid to Google group. + all_google_groups_by_gid = { + email_to_gid(g['email']): g for g in all_google_groups + } + + # Form sets of all Google-side gids. The all_google_gids set includes both groupIDs and + # instIDs. Groups in Google do not have any concept of being suspended. + all_google_gids = set(all_google_groups_by_gid.keys()) + + # Sanity check. We should not have lost anything. (I.e. the gids should be unique.) + if len(all_google_gids) != len(all_google_groups): + raise RuntimeError('Sanity check failed: group list changed length') + + # Retrieve all Google group memberships. This is a mapping from internal Google group ids + # to lists of member resources, corresponding to both Lookup groups and institutions. + fields = ['id', 'email'] + all_google_members = gapiutil.list_all_in_list( + self.state.directory_service, self.state.directory_service.members().list, + item_ids=[g['id'] for g in all_google_groups], id_key='groupKey', + batch_size=self.sync_config.batch_size, items_key='members', + fields='nextPageToken,members(' + ','.join(fields) + ')', + retries=self.sync_config.http_retries, retry_delay=self.sync_config.http_retry_delay, + ) + + # Sanity check. We should have a group members list for each managed group. + if len(all_google_members) != len(all_google_groups): + raise RuntimeError( + 'Sanity check failed: groups in members map do not match group list') + + # Log some stats. 
+ LOG.info('Total Google groups: %s', len(all_google_gids)) + LOG.info( + 'Total Google group members: %s', + sum([len(m) for g, m in all_google_members.items()]) + ) + + self.state.update({ + 'all_google_groups': all_google_groups, + 'all_google_groups_by_gid': all_google_groups_by_gid, + 'all_google_gids': all_google_gids, + 'all_google_members': all_google_members, + }) + + def retrieve_group_settings(self): + # Build the groupssettings service, which is a parallel API to manage group settings + groupssettings_service = discovery.build( + 'groupssettings', 'v1', credentials=self.creds + ) + # Retrieve all Google group settings. + fields = ['email', *[k for k in self.sync_config.group_settings.keys()]] + all_google_group_settings = gapiutil.get_all_in_list( + groupssettings_service, groupssettings_service.groups().get, + item_ids=[g['email'] for g in self.state.all_google_groups], id_key='groupUniqueId', + batch_size=self.sync_config.batch_size, fields=','.join(fields), + retries=self.sync_config.http_retries, retry_delay=self.sync_config.http_retry_delay, + ) + + # Form a mapping from gid to Google group settings. + all_google_group_settings_by_gid = { + email_to_gid(g['email']): g for g in all_google_group_settings + } + + # Sanity check. We should have settings for each managed group. 
+ if len(all_google_group_settings_by_gid) != len(self.state.all_google_groups): + raise RuntimeError( + 'Sanity check failed: group settings list does not match group list' + ) + + self.state.update({ + 'groupssettings_service': groupssettings_service, + 'all_google_group_settings_by_gid': all_google_group_settings_by_gid, + }) + + def _fetch_groups(self, domain): + """ + Function to fetch Google group information from the specified domain + + """ + fields = ['id', 'email', 'name', 'description'] + return gapiutil.list_all( + self.state.directory_service.groups().list, items_key='groups', domain=domain, + fields='nextPageToken,groups(' + ','.join(fields) + ')', + retries=self.sync_config.http_retries, retry_delay=self.sync_config.http_retry_delay, + ) diff --git a/gsuitesync/sync/ldap.py b/gsuitesync/sync/ldap.py new file mode 100644 index 0000000000000000000000000000000000000000..53230613064f0647a1004590e64910a74cf7a6a3 --- /dev/null +++ b/gsuitesync/sync/ldap.py @@ -0,0 +1,288 @@ +""" +Load current user, group and institution data from Lookup. + +""" +import logging +import collections +import ldap3 + +from .base import ConfigurationStateConsumer + +LOG = logging.getLogger(__name__) + +# User and group information we need to populate the Google user directory. +UserEntry = collections.namedtuple('UserEntry', 'uid cn sn displayName givenName') +GroupEntry = collections.namedtuple('GroupEntry', 'groupID groupName description uids') + + +class LDAPRetriever(ConfigurationStateConsumer): + required_config = ('ldap', ) + + def retrieve_users(self): + # Get a set containing all CRSids. These are all the people who are eligible to be in our + # GSuite instance. If a user is in GSuite and is *not* present in this list then they are + # suspended. 
+ LOG.info('Reading eligible user entries from LDAP') + eligible_uids = self.get_eligible_uids() + LOG.info('Total LDAP user entries: %s', len(eligible_uids)) + + # Sanity check: there are some eligible users (else LDAP lookup failure?) + if len(eligible_uids) == 0: + raise RuntimeError('Sanity check failed: no users in eligible set') + + # Get a list of managed users. These are all the people who match the "managed_user_filter" + # in the LDAP settings. + LOG.info('Reading managed user entries from LDAP') + managed_user_entries = self.get_managed_user_entries() + + # Form a mapping from uid to managed user. + managed_user_entries_by_uid = {u.uid: u for u in managed_user_entries} + + # Form a set of all *managed user* uids + managed_user_uids = set(managed_user_entries_by_uid.keys()) + LOG.info('Total managed user entries: %s', len(managed_user_uids)) + + # Sanity check: the managed users should be a subset of the eligible ones. + if len(managed_user_uids - eligible_uids) != 0: + raise RuntimeError( + 'Sanity check failed: some managed uids were not in the eligible set' + ) + + self.state.update({ + 'eligible_uids': eligible_uids, + 'managed_user_entries_by_uid': managed_user_entries_by_uid, + 'managed_user_uids': managed_user_uids, + }) + + def retrieve_groups(self): + # Get a set containing all groupIDs. These are all the groups that are eligible to be in + # our GSuite instance. If a group is in GSuite and is *not* present in this list then it + # is deleted. + LOG.info('Reading eligible group entries from LDAP') + eligible_groupIDs = self.get_eligible_groupIDs() + LOG.info('Total LDAP group entries: %s', len(eligible_groupIDs)) + + # Get a set containing all instIDs. These are all the institutions that are eligible to be + # in our GSuite instance. If an institution is in GSuite and is *not* present in this list + # then the corresponding group is deleted. 
        LOG.info('Reading eligible institution entries from LDAP')
        eligible_instIDs = self.get_eligible_instIDs()
        LOG.info('Total LDAP institution entries: %s', len(eligible_instIDs))

        # Add these sets together to form the set of all gids (the IDs of all eligible groups and
        # institutions).
        eligible_gids = eligible_groupIDs | eligible_instIDs
        LOG.info('Total combined LDAP group and institution entries: %s', len(eligible_gids))

        # Get a list of managed groups. These are all the groups that match the
        # "managed_group_filter" in the LDAP settings.
        LOG.info('Reading managed group entries from LDAP')
        managed_group_entries = self.get_managed_group_entries()

        # Form a mapping from groupID to managed group.
        managed_group_entries_by_groupID = {g.groupID: g for g in managed_group_entries}

        # Form a set of all *managed group* groupIDs
        managed_group_groupIDs = set(managed_group_entries_by_groupID.keys())
        LOG.info('Total managed group entries: %s', len(managed_group_groupIDs))
        LOG.info(
            'Total managed group members: %s',
            sum([len(g.uids) for g in managed_group_entries])
        )

        # Get a list of managed institutions. These are all the institutions that match the
        # "managed_inst_filter" in the LDAP settings.
        LOG.info('Reading managed institution entries from LDAP')
        managed_inst_entries = self.get_managed_inst_entries()

        # Form a mapping from instID to managed institution. Institutions are represented as
        # GroupEntry instances, so the instID is carried in the groupID field (see
        # get_managed_inst_entries below).
        managed_inst_entries_by_instID = {i.groupID: i for i in managed_inst_entries}

        # Form a set of all *managed institution* instIDs
        managed_inst_instIDs = set(managed_inst_entries_by_instID.keys())
        LOG.info('Total managed institution entries: %s', len(managed_inst_instIDs))
        LOG.info(
            'Total managed institution members: %s',
            sum([len(i.uids) for i in managed_inst_entries])
        )

        # Add the collections of managed institutions to the collections of managed groups.
        managed_group_entries += managed_inst_entries
        managed_group_entries_by_gid = {
            **managed_group_entries_by_groupID, **managed_inst_entries_by_instID
        }
        # NOTE(review): this unions the *eligible* instIDs rather than managed_inst_instIDs.
        # managed_group_gids only feeds the log line and the subset sanity check below (both
        # still hold, since eligible_instIDs is part of eligible_gids), but
        # managed_inst_instIDs looks like the intended operand -- confirm.
        managed_group_gids = managed_group_groupIDs | eligible_instIDs
        LOG.info(
            'Total combined managed group and institution entries: %s', len(managed_group_gids)
        )
        LOG.info(
            'Total combined managed group and institution members: %s',
            sum([len(g.uids) for g in managed_group_entries])
        )

        # Sanity check: the managed groups should be a subset of the eligible ones.
        if len(managed_group_gids - eligible_gids) != 0:
            raise RuntimeError(
                'Sanity check failed: some managed gids were not in the eligible set'
            )

        # Publish results for the later comparison/update stages.
        self.state.update({
            'eligible_gids': eligible_gids,
            'managed_group_entries_by_gid': managed_group_entries_by_gid,
        })

    ###
    # Functions to perform LDAP calls
    ###
    def get_eligible_uids(self):
        """
        Return a set containing all uids who are eligible to have a Google account.

        """
        return {
            e['attributes']['uid'][0]
            for e in self._search(
                search_base=self.ldap_config.user_search_base,
                search_filter=self.ldap_config.eligible_user_filter,
                attributes=['uid']
            )
        }

    def get_eligible_groupIDs(self):
        """
        Return a set containing all groupIDs that are eligible for Google.

        """
        return {
            e['attributes']['groupID'][0]
            for e in self._search(
                search_base=self.ldap_config.group_search_base,
                search_filter=self.ldap_config.eligible_group_filter,
                attributes=['groupID']
            )
        }

    def get_eligible_instIDs(self):
        """
        Return a set containing all instIDs that are eligible for Google.

        """
        return {
            e['attributes']['instID'][0]
            for e in self._search(
                search_base=self.ldap_config.inst_search_base,
                search_filter=self.ldap_config.eligible_inst_filter,
                attributes=['instID']
            )
        }

    def get_managed_user_entries(self):
        """
        Return a list containing all managed user entries as UserEntry instances.

        """
        # Fall back to the eligibility filter when no narrower managed-user filter is
        # configured.
        search_filter = (
            self.ldap_config.managed_user_filter
            if self.ldap_config.managed_user_filter is not None
            else self.ldap_config.eligible_user_filter
        )
        return [
            UserEntry(
                uid=_extract(e, 'uid'), cn=_extract(e, 'cn'), sn=_extract(e, 'sn'),
                displayName=_extract(e, 'displayName'), givenName=_extract(e, 'givenName')
            )
            for e in self._search(
                search_base=self.ldap_config.user_search_base, search_filter=search_filter,
                attributes=['uid', 'cn', 'sn', 'displayName', 'givenName']
            )
        ]

    def get_managed_group_entries(self):
        """
        Return a list containing all managed group entries as GroupEntry instances.

        """
        # Fall back to the eligibility filter when no narrower managed-group filter is
        # configured.
        search_filter = (
            self.ldap_config.managed_group_filter
            if self.ldap_config.managed_group_filter is not None
            else self.ldap_config.eligible_group_filter
        )
        return [
            GroupEntry(
                groupID=_extract(e, 'groupID'), groupName=_extract(e, 'groupName'),
                description=_extract(e, 'description'), uids=set(e['attributes'].get('uid', []))
            )
            for e in self._search(
                search_base=self.ldap_config.group_search_base, search_filter=search_filter,
                attributes=['groupID', 'groupName', 'description', 'uid']
            )
        ]

    def get_managed_inst_entries(self):
        """
        Return a list containing all managed institution entries as GroupEntry instances.

        Note that we return GroupEntry instances here since Lookup institutions become groups in
        Google, and this simplifies the sync code by allowing us to handle institutions in the same
        way as groups. The GroupEntry's groupID and groupName fields will be the institution's
        instID and ou (name) respectively. Since Lookup institutions don't have descriptions, we
        set the description field to the institution's name as well (in Google, the description
        allows longer strings, and so will not truncate the name).

        """
        # This requires 2 LDAP queries. First find the managed institutions.
        search_filter = (
            self.ldap_config.managed_inst_filter
            if self.ldap_config.managed_inst_filter is not None
            else self.ldap_config.eligible_inst_filter
        )
        managed_insts = [
            GroupEntry(
                groupID=_extract(e, 'instID'), groupName=_extract(e, 'ou'),
                description=_extract(e, 'ou'), uids=set(),
            )
            for e in self._search(
                search_base=self.ldap_config.inst_search_base, search_filter=search_filter,
                attributes=['instID', 'ou']
            )
        ]
        managed_insts_by_instID = {g.groupID: g for g in managed_insts}

        # Then get each eligible user's list of institutions and use that data to populate each
        # institution's uid list.
        eligible_users = self._search(
            search_base=self.ldap_config.user_search_base,
            search_filter=self.ldap_config.eligible_user_filter,
            attributes=['uid', 'instID']
        )
        for e in eligible_users:
            uid = e['attributes']['uid'][0]
            for instID in e['attributes']['instID']:
                if instID in managed_insts_by_instID:
                    managed_insts_by_instID[instID].uids.add(uid)

        return managed_insts

    def _search(self, *, search_base, search_filter, attributes):
        # Use SSL to access the LDAP server when authentication credentials
        # have been configured
        use_ssl = bool(self.ldap_config.username and self.ldap_config.password)
        ldap_server = ldap3.Server(self.ldap_config.host, use_ssl=use_ssl)

        # Add authentication credentials if configured
        username = self.ldap_config.username if self.ldap_config.username else None
        password = self.ldap_config.password if self.ldap_config.password else None

        # Connect to the LDAP server and perform the query
        with ldap3.Connection(ldap_server, username, password, auto_bind=True) as conn:
            return conn.extend.standard.paged_search(
                search_base, search_filter, paged_size=1000, attributes=attributes)


def _extract(entry, attr, *, default=''):
    # Pull the first value of a (possibly multi-valued) LDAP attribute, tolerating both the
    # list-valued and plain-string attribute shapes that ldap3 can return.
    vs = entry['attributes'].get(attr, [])
    if len(vs) == 0:
        return default
    if isinstance(vs, str):
        return vs
    return vs[0]
diff --git a/gsuitesync/sync/main.py
b/gsuitesync/sync/main.py new file mode 100644 index 0000000000000000000000000000000000000000..657fde9f69b86da9704ef5133397ab81ff71c4ae --- /dev/null +++ b/gsuitesync/sync/main.py @@ -0,0 +1,62 @@
"""
Synchronise Google Directory with a local LDAP directory.

"""
import logging

from .. import config
from .state import SyncState
from .ldap import LDAPRetriever
from .gapi import GAPIRetriever
from .compare import Comparator
from .update import GAPIUpdater

LOG = logging.getLogger(__name__)


def sync(configuration, *, read_only=True, group_settings=False, just_users=False):
    """
    Perform sync given configuration dictionary.

    :param configuration: raw configuration mapping, parsed into dataclasses here.
    :param read_only: when True (the default) no write requests are sent to Google.
    :param group_settings: also compare/update the settings of existing Google groups.
    :param just_users: restrict the sync to user accounts, skipping groups entirely.
    """
    if read_only:
        LOG.info('Performing synchronisation in READ ONLY mode.')
    else:
        LOG.info('Performing synchronisation in WRITE mode.')

    # Parse configuration into Configuration dict of appropriate dataclasses
    configuration = config.parse_configuration(configuration)

    # Class to hold all state that can be updated by the process below then
    # used to do updates
    state = SyncState()

    # Get users and optionally groups from Lookup
    ldap = LDAPRetriever(configuration, state)
    ldap.retrieve_users()
    if not just_users:
        ldap.retrieve_groups()

    # Get users and optionally groups from Google
    gapi = GAPIRetriever(configuration, state)
    gapi.connect(read_only)
    gapi.retrieve_users()
    if not just_users:
        gapi.retrieve_groups()
        # Optionally get group settings too
        if group_settings:
            gapi.retrieve_group_settings()

    # Compare users and optionally groups between Lookup and Google
    comparator = Comparator(configuration, state)
    comparator.compare_users()
    if not just_users:
        comparator.compare_groups()
        # Optionally compare existing group settings too
        if group_settings:
            comparator.compare_groups_settings()
    # Enforce creation/update limits
    comparator.enforce_limits(just_users)

    # Update Google with necessary updates found doing comparison
    updater = GAPIUpdater(configuration, state, read_only)
    updater.update_users()
    if not just_users:
        updater.update_groups()
diff --git a/gsuitesync/sync/state.py b/gsuitesync/sync/state.py new file mode 100644 index 0000000000000000000000000000000000000000..480adf862cfb62e293c311e19d9e54379573ec7b --- /dev/null +++ b/gsuitesync/sync/state.py @@ -0,0 +1,82 @@
"""
A dataclass to hold the built up state of Lookup and Google data and needed updates

"""
from typing import Optional
from dataclasses import dataclass, field
from googleapiclient import discovery


@dataclass
class SyncState:
    # Shared mutable state for one sync run: populated progressively by the retriever,
    # comparator and updater stages (see main.sync for the pipeline order).

    ################
    # Data retrieved from Lookup
    ################

    # user data
    eligible_uids: set = field(default_factory=set)
    managed_user_entries_by_uid: dict = field(default_factory=dict)
    managed_user_uids: set = field(default_factory=set)
    # group data
    eligible_gids: set = field(default_factory=set)
    managed_group_entries_by_gid: dict = field(default_factory=dict)

    ################
    # Components needed when communicating with Google API
    ################
    directory_service: Optional[discovery.Resource] = None
    groupssettings_service: Optional[discovery.Resource] = None
    groups_domain: str = ''
    insts_domain: str = ''

    ################
    # Data retrieved from Google
    ################

    # user data
    all_google_users: list = field(default_factory=list)
    all_google_users_by_uid: dict = field(default_factory=dict)
    all_google_uids: set = field(default_factory=set)
    suspended_google_uids: set = field(default_factory=set)
    # group data
    all_google_groups: list = field(default_factory=list)
    all_google_groups_by_gid: dict = field(default_factory=dict)
    all_google_gids: set = field(default_factory=set)
    # group membership data
    all_google_members: dict = field(default_factory=dict)
    # group settings data
    all_google_group_settings_by_gid: dict = field(default_factory=dict)

    ################
    # Results of comparison
    ################

    # updates to users
    google_user_updates: dict = field(default_factory=dict)
    google_user_creations: dict = field(default_factory=dict)
    uids_to_update: set = field(default_factory=set)
    uids_to_add: set = field(default_factory=set)
    uids_to_reactivate: set = field(default_factory=set)
    uids_to_suspend: set = field(default_factory=set)
    # updates to groups
    google_group_updates: dict = field(default_factory=dict)
    google_group_creations: dict = field(default_factory=dict)
    gids_to_update: set = field(default_factory=set)
    gids_to_add: set = field(default_factory=set)
    gids_to_delete: set = field(default_factory=set)
    # updates to group memberships
    members_to_insert: list = field(default_factory=list)
    members_to_delete: list = field(default_factory=list)
    # updates to group settings
    group_settings_to_update: dict = field(default_factory=dict)
    gids_to_update_group_settings: set = field(default_factory=set)

    ################
    # Allow easy updating from dict
    ################
    def update(self, data: dict):
        # Bulk-assign known attributes; rejecting unknown keys catches typos in the
        # stage code that would otherwise silently create dead attributes.
        for key, value in data.items():
            if hasattr(self, key):
                setattr(self, key, value)
            else:
                raise RuntimeError(f"Attempt to add invalid key '{key}' to state")
diff --git a/gsuitesync/sync/update.py b/gsuitesync/sync/update.py new file mode 100644 index 0000000000000000000000000000000000000000..d68152f4c5d2ff3ed2ce5a3a64fb9bd8ac6167a6 --- /dev/null +++ b/gsuitesync/sync/update.py @@ -0,0 +1,162 @@
"""
Perform the actual updates in Google (unless in read_only mode)

"""
import logging
# NOTE(review): the crypt module is deprecated since Python 3.11 and removed in 3.13;
# fine on the pinned 3.7 image but it will need replacing (e.g. hashlib) on upgrade.
import crypt
import secrets

from .base import ConfigurationStateConsumer
from .utils import gid_to_email, uid_to_email
from ..gapiutil import process_requests

LOG = logging.getLogger(__name__)


class GAPIUpdater(ConfigurationStateConsumer):
    # Consumes the comparison results stored in SyncState and issues the corresponding
    # Google API requests (or merely logs them when read_only is set).
    required_config = ('sync', 'gapi_domain')

    def __init__(self, configuration, state, read_only=True):
        super(GAPIUpdater, self).__init__(configuration, state)
        # When True, process_requests logs the planned calls without executing them.
        self.read_only = read_only

    def update_users(self):
        process_requests(
            self.state.directory_service,
            self.user_api_requests(),
            self.sync_config, self.read_only)

    def update_groups(self):
        process_requests(
            self.state.directory_service,
            self.group_api_requests(),
            self.sync_config, self.read_only)
        # Still need to do this even if `not group_settings` as new groups need their settings
        process_requests(
            self.state.groupssettings_service,
            self.group_settings_api_requests(),
            self.sync_config, self.read_only)
        process_requests(
            self.state.directory_service,
            self.member_api_requests(),
            self.sync_config, self.read_only)

    def user_api_requests(self):
        """
        A generator which will generate patch() and insert() calls to the directory service to
        perform the actions required to update users

        """
        # Update existing users.
        user_updates = {
            uid: self.state.google_user_updates[uid] for uid in self.state.uids_to_update
        }
        for uid, update in user_updates.items():
            google_id = self.state.all_google_users_by_uid[uid]['id']
            # Only show the previous parts of name that have been changed
            updated_google_user_name = update.get('name', {})
            previous_google_user_name = self.state.all_google_users_by_uid[uid].get('name', {})
            previous = {
                k: previous_google_user_name.get(k, '')
                for k in ['givenName', 'familyName']
                if k in updated_google_user_name
            }
            LOG.info('Update user "%s": "%r" from "%r"', uid, update, previous)
            yield self.state.directory_service.users().patch(userKey=google_id, body=update)

        # Suspend old users
        for uid in self.state.uids_to_suspend:
            google_id = self.state.all_google_users_by_uid[uid]['id']
            LOG.info('Suspending user: "%s"', uid)
            yield self.state.directory_service.users().patch(
                userKey=google_id, body={'suspended': True})

        # Reactivate returning users
        for uid in self.state.uids_to_reactivate:
            google_id = self.state.all_google_users_by_uid[uid]['id']
            LOG.info('Reactivating user: "%s"', uid)
            yield self.state.directory_service.users().patch(
                userKey=google_id, body={'suspended': False})

        # Create new users
        for uid in self.state.uids_to_add:
            # Generate a random password which is thrown away. The account creation defaults
            # below can be overridden by the per-user creation dict from the comparator.
            # (crypt is deprecated from Python 3.11 -- see note at the import.)
            new_user = {**{
                'hashFunction': 'crypt',
                'password': crypt.crypt(secrets.token_urlsafe(), crypt.METHOD_SHA512),
                'orgUnitPath': self.sync_config.new_user_org_unit_path,
            }, **self.state.google_user_creations[uid]}
            # Never log the password hash.
            redacted_user = {**new_user, **{'password': 'REDACTED'}}
            LOG.info('Adding user "%s": %s', uid, redacted_user)
            yield self.state.directory_service.users().insert(body=new_user)

    def group_api_requests(self):
        """
        A generator which will generate patch(), insert() and delete() calls to the directory
        service to perform the actions required to update groups

        """
        # Update existing groups
        group_updates = {
            gid: self.state.google_group_updates[gid] for gid in self.state.gids_to_update
        }
        for gid, update in group_updates.items():
            google_id = self.state.all_google_groups_by_gid[gid]['id']
            LOG.info('Update group "%s": "%r"', gid, update)
            yield self.state.directory_service.groups().patch(groupKey=google_id, body=update)

        # Delete cancelled groups
        for gid in self.state.gids_to_delete:
            google_id = self.state.all_google_groups_by_gid[gid]['id']
            LOG.info('Deleting group: "%s"', gid)
            yield self.state.directory_service.groups().delete(groupKey=google_id)

        # Create new groups
        for gid in self.state.gids_to_add:
            new_group = self.state.google_group_creations[gid]
            LOG.info('Adding group "%s": %s', gid, new_group)
            yield self.state.directory_service.groups().insert(body=new_group)

    def member_api_requests(self):
        """
        A generator which will generate insert() and delete() calls to the directory service to
        perform the actions required to update group members

        """
        # Insert new members
        for gid, uid in self.state.members_to_insert:
            group_key = gid_to_email(gid, self.state.groups_domain, self.state.insts_domain)
            user_key = uid_to_email(uid, self.gapi_domain_config.name)
            LOG.info('Adding user "%s" to group "%s"', user_key, group_key)
            yield self.state.directory_service.members().insert(
                groupKey=group_key, body={'email': user_key})

        # Delete removed members
        for gid, uid in self.state.members_to_delete:
            group_key = gid_to_email(gid, self.state.groups_domain, self.state.insts_domain)
            user_key = uid_to_email(uid, self.gapi_domain_config.name)
            LOG.info('Removing user "%s" from group "%s"', user_key, group_key)
            yield self.state.directory_service.members().delete(
                groupKey=group_key, memberKey=user_key)

    def group_settings_api_requests(self):
        """
        A generator which will generate patch() calls to the groupssettings service to set or
        update the required group settings.

        """
        # Apply all settings to new groups.
        for gid in self.state.gids_to_add:
            email = gid_to_email(gid, self.state.groups_domain, self.state.insts_domain)
            settings = self.sync_config.group_settings
            LOG.info('Updating settings for new group "%s": %s', gid, settings)
            yield self.state.groupssettings_service.groups().patch(
                groupUniqueId=email, body=settings)

        # Update existing group settings (will be empty if `not group_settings`)
        for gid in self.state.gids_to_update_group_settings:
            email = gid_to_email(gid, self.state.groups_domain, self.state.insts_domain)
            settings = self.state.group_settings_to_update[gid]
            LOG.info('Updating settings for existing group "%s": %s', gid, settings)
            yield self.state.groupssettings_service.groups().patch(
                groupUniqueId=email, body=settings)
diff --git a/gsuitesync/sync/utils.py b/gsuitesync/sync/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..779e8e90a323f5a3a5b64e00888bfcc48aa5c063 --- /dev/null +++ b/gsuitesync/sync/utils.py @@ -0,0 +1,48 @@
import logging
import re

LOG = logging.getLogger(__name__)


# Functions to translate the unique identifiers of users, groups
# and institutions in Lookup (uids, groupIDs and instIDs) to and from the unique
# identifiers used in Google (email addresses).
#
# For users:  {uid} <-> {uid}@{domain}
# For groups: {groupID} <-> {groupID}@{groups_domain}
# For insts:  {instID} <-> {instID.lower()}@{insts_domain} (local part must be lowercase)
#
# Valid uids (CRSids) match [a-z][a-z0-9]{3,7}, valid groupIDs match [0-9]{6,8} and valid
# instIDs match [A-Z][A-Z0-9]+.
#
# Lookup institutions become ordinary groups in Google, so the sync code uses "gid" for
# the Lookup-side identifier of either kind (i.e. a gid is a Lookup groupID or instID).

user_email_regex = re.compile('^[a-z][a-z0-9]{3,7}@.*$')
groupID_regex = re.compile('^[0-9]{6,8}$')
instID_regex = re.compile('^[A-Z][A-Z0-9]+$')


def email_to_uid(email):
    """Return the CRSid local part of a user email address, or None if it is not one."""
    if user_email_regex.match(email):
        return email.split('@')[0]
    return None


def email_to_gid(email):
    """Return the Lookup groupID or (uppercased) instID encoded in a group email, or None."""
    local_part = email.split('@')[0]
    if groupID_regex.match(local_part):
        return local_part
    upper = local_part.upper()
    if instID_regex.match(upper):
        return upper
    return None


def uid_to_email(uid, domain):
    """Return the Google email address for the given Lookup uid."""
    return uid + '@' + domain


def gid_to_email(gid, groups_domain, insts_domain):
    """Return the Google group email address for the given gid, or None if it is invalid."""
    if groupID_regex.match(gid):
        return f'{gid}@{groups_domain}'
    if instID_regex.match(gid):
        return f'{gid.lower()}@{insts_domain}'
    return None