From 2143edc54a2db1fcccab6de37453b685025d56f6 Mon Sep 17 00:00:00 2001 From: Mo Alsad <mo.alsad@gmail.com> Date: Mon, 20 Jun 2022 00:40:36 +0100 Subject: [PATCH 1/4] Add conversion code --- .gitignore | 156 ++++++++++++++++++ README.md | 99 +++-------- imc2zarr/__init__.py | 33 ++++ imc2zarr/converter.py | 64 ++++++++ imc2zarr/imclib/LICENSE | 21 +++ imc2zarr/imclib/__init__.py | 0 imc2zarr/imclib/imcdataparser.py | 165 +++++++++++++++++++ imc2zarr/imclib/imcraw.py | 171 +++++++++++++++++++ imc2zarr/imclib/mcdmeta.py | 213 ++++++++++++++++++++++++ imc2zarr/imclib/mcdutils.py | 272 +++++++++++++++++++++++++++++++ imc2zarr/imclib/mcdxmlparser.py | 114 +++++++++++++ imc2zarr/imclib/metadefs.py | 71 ++++++++ pyproject.toml | 3 + setup.cfg | 40 +++++ test.py | 4 + 15 files changed, 1350 insertions(+), 76 deletions(-) create mode 100644 .gitignore create mode 100644 imc2zarr/__init__.py create mode 100644 imc2zarr/converter.py create mode 100644 imc2zarr/imclib/LICENSE create mode 100644 imc2zarr/imclib/__init__.py create mode 100644 imc2zarr/imclib/imcdataparser.py create mode 100644 imc2zarr/imclib/imcraw.py create mode 100644 imc2zarr/imclib/mcdmeta.py create mode 100644 imc2zarr/imclib/mcdutils.py create mode 100644 imc2zarr/imclib/mcdxmlparser.py create mode 100644 imc2zarr/imclib/metadefs.py create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 test.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a14d6d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,156 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other 
infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +.idea/ diff --git a/README.md b/README.md index cf419ff..828ab17 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,39 @@ # IMC to Zarr converter +Convert IMC scan dataset to Zarr. - -## Getting started - -To make it easy for you to get started with GitLab, here's a list of recommended next steps. - -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! - -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: +## Install ``` -cd existing_repo -git remote add origin https://gitlab.developers.cam.ac.uk/astronomy/camcead/imaxt/imc2zarr.git -git branch -M main -git push -uf origin main +pip install imc2zarr ``` -## Integrate with your tools - -- [ ] [Set up project integrations](https://gitlab.developers.cam.ac.uk/astronomy/camcead/imaxt/imc2zarr/-/settings/integrations) - -## Collaborate with your team - -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge 
request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Automatically merge when pipeline succeeds](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) - -## Test and Deploy - -Use the built-in continuous integration in GitLab. - -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing(SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) - -*** - -# Editing this README +### Requirements -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thank you to [makeareadme.com](https://www.makeareadme.com/) for this template. - -## Suggestions for a good README -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. - -## Name -Choose a self-explaining name for your project. 
- -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. - -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. - -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. - -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. +* click +* numpy +* pandas +* python_dateutil +* xarray ## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. 
+### Arguments +* input_path: + * the root folder of the IMC scan containing a single mcd file and/or other related files: XML meta & scan data in text format + * or, the path of an mcd file +* output_path: the location where to store the converted output in Zarr format -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. +### From Python script -## Contributing -State if you are open to contributions and what your requirements are for accepting them. - -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. - -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. +``` +from imc2zarr import imc2zarr -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. +imc2zarr(input_path, output_path) +``` -## License -For open source projects, say how it is licensed. +### From the command line -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. 
+```
+imc2zarr input_path output_path
+```
diff --git a/imc2zarr/__init__.py b/imc2zarr/__init__.py
new file mode 100644
index 0000000..e094bc8
--- /dev/null
+++ b/imc2zarr/__init__.py
@@ -0,0 +1,33 @@
+import traceback
+from pathlib import Path
+
+import click
+
+from .converter import Imc2Zarr
+
+__version__ = "0.1.0"
+__author__ = "Mo Alsad and Eduardo Gonzalez Solares"
+__email__ = "msa51@cam.ac.uk"
+__credits__ = [
+    "Mo Alsad",
+    "Eduardo Gonzalez Solares",
+    "Vito Zanotelli",
+    "Anton Rau",
+    "Jonas Windhager"
+]
+
+
+def imc2zarr(input_path: Path, output_path: Path):
+    """Convert the IMC scan at input_path to Zarr under output_path; errors are printed, not raised."""
+    try:
+        imc2zarr_converter = Imc2Zarr(input_path, output_path)
+        imc2zarr_converter.convert()
+    except Exception as err:
+        print('Error: {}\nDetails: {}'.format(str(err), traceback.format_exc()))
+
+
+@click.command()
+@click.argument('input_path')
+@click.argument('output_path')
+def main(input_path, output_path):
+    imc2zarr(input_path, output_path)
diff --git a/imc2zarr/converter.py b/imc2zarr/converter.py
new file mode 100644
index 0000000..4774cb5
--- /dev/null
+++ b/imc2zarr/converter.py
@@ -0,0 +1,64 @@
+from pathlib import Path
+from contextlib import closing
+import json
+
+import xarray as xr
+
+from .imclib.imcraw import ImcRaw
+
+
+class Imc2Zarr:
+
+    def __init__(self, input_dir, output_dir):
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.output_fn = None  # path of the Zarr store, assigned by convert()
+
+    def convert(self):
+        """Write the Zarr store, the raw-meta XML and the snapshots for input_dir."""
+        # closing() calls imc.close() even when the conversion fails, so the
+        # exception simply propagates to the caller
+        with closing(ImcRaw(self.input_dir)) as imc:
+            # assign output filename based on IMC run timestamp
+            input_name = self.input_dir.name
+            if self.input_dir.is_file():
+                input_name = self.input_dir.stem
+            self.output_fn = self.output_dir.joinpath('{}_{}.zarr'.format(input_name, imc.code))
+            # save acquisitions into Zarr
+            self._convert2zarr(imc)
+            # save raw meta and snapshots
+            self._save_auxiliary_data(imc)
+
+    def _convert2zarr(self, imc: ImcRaw):
+        ds = xr.Dataset()
+        # set meta for root
+        ds.attrs['meta'] =
[json.loads(json.dumps(imc.meta_summary, default=str))]
+        ds.attrs['raw_meta'] = imc.rawmeta
+        ds.to_zarr(self.output_fn, mode='w')
+        # each acquisition is stored as its own group, named 'Q' + zero-padded id
+        for q in imc.acquisitions:
+            data = imc.get_acquisition_data(q)
+            nchannels, ny, nx = data.shape
+            q_name = 'Q{}'.format(str(q.id).zfill(3))
+            ds_q = xr.Dataset()
+            arr = xr.DataArray(
+                data,
+                dims=('channel', 'y', 'x'),
+                name='data',
+                coords={
+                    'channel': range(nchannels),
+                    'y': range(ny),
+                    'x': range(nx)
+                },
+            )
+            arr.attrs['meta'] = [json.loads(json.dumps(q.meta_summary, default=str))]
+            ds_q[q_name] = arr
+            ds_q.attrs['meta'] = arr.attrs['meta']
+            # append acquisition to existing dataset
+            ds_q.to_zarr(self.output_fn, group=q_name, mode='a')
+
+    def _save_auxiliary_data(self, imc: ImcRaw):
+        """Save the raw meta XML and the snapshot images under the Zarr path."""
+        imc.save_meta_xml(self.output_fn)
+        # snapshots go into a 'snapshots' sub-folder of the Zarr path
+        snapshot_dir = self.output_fn.joinpath('snapshots')
+        imc.save_snapshot_images(snapshot_dir)
\ No newline at end of file
diff --git a/imc2zarr/imclib/LICENSE b/imc2zarr/imclib/LICENSE
new file mode 100644
index 0000000..2ea7145
--- /dev/null
+++ b/imc2zarr/imclib/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021, University of Zurich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/imc2zarr/imclib/__init__.py b/imc2zarr/imclib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/imc2zarr/imclib/imcdataparser.py b/imc2zarr/imclib/imcdataparser.py
new file mode 100644
index 0000000..2e348b0
--- /dev/null
+++ b/imc2zarr/imclib/imcdataparser.py
@@ -0,0 +1,165 @@
+import re, os
+from pathlib import Path
+import xml.etree.ElementTree as et
+import time
+
+import numpy as np
+
+from .mcdutils import McdUtils
+from .mcdxmlparser import McdXmlParser
+from .mcdmeta import Slide, Panorama, Acquisition
+from .metadefs import *
+
+
+class ImcDataParser:
+    """Parsing data from Fluidigm MCD files"""
+
+    def __init__(self, mcdfilename, *, metafilename=None, textfilenames=None):
+        """
+        :param mcdfilename: MCD filename
+        :param metafilename: in case of a separate meta filename
+        :param textfilenames: filenames of scan data in text format (may be None)
+        """
+        self._mcd_fsize = os.path.getsize(mcdfilename)
+        self._mcd_fh = open(Path(mcdfilename), mode="rb")
+        if metafilename is None:
+            self._meta_fh = self._mcd_fh
+        else:
+            self._meta_fh = open(Path(metafilename), mode="rb")
+        self._txt_fhs = []
+        for tf in textfilenames or []:  # the default None means "no text files"
+            self._txt_fhs.append(open(Path(tf), mode="r"))
+        self._xml = None
+        self._ns = None
+        self._use_mmap = True  # always use memory maps
+        self.meta = None
+
+    def read_mcd_xml(self):
+        if self._use_mmap:
+            xml = McdUtils.read_mcd_xml_mmap(self._meta_fh)
+        else:
+            xml = McdUtils.read_mcd_xml(self._meta_fh)
+        # This is for mcd schemas, where the namespace are often messed up.
+        xml = xml.replace("diffgr:", "").replace("msdata:", "")
+        xml = xml.replace("\x00", "")
+        self.xml_str = xml
+        # remove namespace entry
+        xml = re.sub(r'\sxmlns="[^"]+"', '', xml, count=1)
+        self._xml = et.fromstring(xml)
+        self._ns = "{" + self._xml.tag.split("}")[0].strip("{") + "}"
+
+    def parse_mcd_xml(self):
+        """
+        Parse the mcd xml into a metadata object
+        """
+        self.meta = McdXmlParser(self._xml, self._meta_fh.name)
+
+    def get_acquisition_data(self, q: Acquisition):
+        """Return channel data from the mcd, else the text file, else a (1, 1, 1) zero array."""
+        img = None
+        mcd_valid = txt_valid = False
+        # try to read data from mcd
+        try:
+            img = self._read_mcd_acquisition_data(q)
+            mcd_valid = True
+        except Exception:
+            pass  # deliberate best effort: fall back to the text file below
+        # if mcd is invalid try to read data from text
+        if not mcd_valid:
+            try:
+                img = self._read_txt_acquisition_data(q)
+                txt_valid = True
+            except Exception:
+                pass  # deliberate best effort: reported as 'invalid' below
+        # ToDo: if both sources are invalid try to read from mcd using different data size
+
+        if mcd_valid:
+            source = 'mcd'
+        elif txt_valid:
+            source = 'txt'
+        else:
+            source = 'invalid'
+            img = np.zeros((1, 1, 1))
+        q.meta_summary['q_data_source'] = source
+        return img
+
+    def _read_mcd_acquisition_data(self, q: Acquisition):
+        """Read the acquisition's values from the mcd as a (channel, height, width) array."""
+        data_size, data_nrows = q.data_size, q.data_nrows
+        if q.data_offset_start >= q.data_offset_end \
+                or (q.data_offset_start + data_size) > self._mcd_fsize:
+            raise Exception('Invalid acquisition buffer size')
+        buffer = np.memmap(
+            self._mcd_fh,
+            dtype="<f",  # little-endian
+            mode="r",
+            offset=q.data_offset_start,
+            shape=data_size // q.value_bytes,  # integer division, no float rounding
+        )
+        data = np.array([buffer[idx::q.n_channels] for idx in range(q.n_channels)])
+        shape = [int(data[0].max()) + 1, int(data[1].max()) + 1]
+        if np.prod(shape) > data_nrows:
+            shape[1] -= 1
+        q.meta_summary['q_width'] = shape[0]
+        q.meta_summary['q_height'] = shape[1]
+        data = data[:, :(np.prod(shape))]
+        data = np.reshape(data, [q.n_channels, shape[1], shape[0]], order='C')
+        return data
+
+    def
_read_txt_acquisition_data(self, q: Acquisition):
+        # look for available text files matching the acquisition ID
+        fn_end = '{}_{}.txt'.format(q.get_property(DESCRIPTION), q.id)
+        q.txt_fh = None
+        for fh in self._txt_fhs:
+            if fh.name.endswith(fn_end):
+                q.txt_fh = fh
+                break
+        if not q.txt_fh:
+            raise Exception('Acquisition has no text file')
+        elif not McdUtils.valid_txt_file(q.txt_fh):
+            raise Exception('Acquisition text file is empty')
+        data, shape, channel_names = McdUtils.read_acquisition_text_data(q.txt_fh)
+        q.meta_summary['q_width'] = shape[0]
+        q.meta_summary['q_height'] = shape[1]
+        return data
+
+    def save_snapshot_image(self, obj, out_folder):
+        """Write the image(s) embedded in the mcd for a Slide, Panorama or Acquisition to out_folder."""
+        # parallel lists: output file name and byte range of each image
+        fn, start_offset, end_offset = [], [], []
+        if isinstance(obj, Slide):
+            fn.append('Slide.jpg')
+            start_offset.append(int(obj.get_property(IMAGESTARTOFFSET)) + IMAGE_START_OFFSET)
+            end_offset.append(int(obj.get_property(IMAGEENDOFFSET)))
+        elif isinstance(obj, Panorama):
+            fn.append('Panorama_{}.png'.format(obj.id))
+            start_offset.append(int(obj.get_property(IMAGESTARTOFFSET)) + IMAGE_START_OFFSET)
+            end_offset.append(int(obj.get_property(IMAGEENDOFFSET)))
+        elif isinstance(obj, Acquisition):
+            fn.append('Acquisition_{}_Before.png'.format(obj.id))
+            start_offset.append(int(obj.get_property(BEFOREABLATIONIMAGESTARTOFFSET)) + IMAGE_START_OFFSET)
+            end_offset.append(int(obj.get_property(BEFOREABLATIONIMAGEENDOFFSET)))
+            fn.append('Acquisition_{}_After.png'.format(obj.id))
+            start_offset.append(int(obj.get_property(AFTERABLATIONIMAGESTARTOFFSET)) + IMAGE_START_OFFSET)
+            end_offset.append(int(obj.get_property(AFTERABLATIONIMAGEENDOFFSET)))
+        for name, start, end in zip(fn, start_offset, end_offset):
+            data_length = end - start
+            if data_length <= 0:
+                continue
+            fp = out_folder.joinpath(name)
+            # _use_mmap is always set to True in __init__, so the mmap reader is
+            # the only implemented path; guarding it could leave `buffer` unbound
+            buffer = McdUtils.read_mcd_buffer_mmap(self._mcd_fh, start, data_length)
+            with open(fp, "wb") as f:
+                f.write(buffer)
+
+    def close(self):
+        """Close file handles"""
+        try:
+
self._mcd_fh.close()
+            if self._mcd_fh != self._meta_fh:
+                self._meta_fh.close()
+            for tfh in self._txt_fhs:
+                tfh.close()
+        except Exception:
+            pass
diff --git a/imc2zarr/imclib/imcraw.py b/imc2zarr/imclib/imcraw.py
new file mode 100644
index 0000000..1fd7535
--- /dev/null
+++ b/imc2zarr/imclib/imcraw.py
@@ -0,0 +1,171 @@
+import os
+from pathlib import Path
+import uuid
+from collections import OrderedDict
+from dateutil.parser import parse as dateparser
+
+from .imcdataparser import ImcDataParser
+from .mcdmeta import Slide, Panorama, AcquisitionRoi, Acquisition
+from .metadefs import *
+
+"""
+    ImcRaw: holds metadata about raw IMC scan and all of its components
+"""
+
+
+class ImcRaw:
+
+    def __init__(self, input_dir, code=None):
+        """Find and parse the scan at input_dir (folder or mcd file); code defaults to the run timestamp."""
+        self.input_dir = Path(input_dir)  # accept str as well as Path
+        self.code = code
+        self.mcd_fn = None
+        self.txt_fns = []  # text filenames
+        self._find_files()
+        self._parse_files()
+        self._build_object_lists()
+        self._assign_imc_meta_summary()
+
+    def _assign_imc_meta_summary(self):
+        # assign run timestamp from the first acquisition
+        self.timestamp = dateparser(self.acquisitions[0].get_property(STARTTIMESTAMP))
+        # assign code
+        if not self.code:
+            self.code = self.timestamp.strftime('%Y%m%d-%H%M%S-%f')
+        self.rawmeta = self._parser.xml_str.replace("\n", "")
+        self.mcd_sw_version = self.slides[0].get_property(SWVERSION)
+        # set meta summary
+        self.meta_summary = OrderedDict()
+        self.meta_summary['description'] = self.slides[0].get_property(DESCRIPTION)
+        self.meta_summary['n_acquisitions'] = len(self.acquisitions)
+        self.meta_summary['mcd_sw_version'] = self.mcd_sw_version
+        self.meta_summary['run_date'] = self.acquisitions[0].get_property(STARTTIMESTAMP)
+        self.meta_summary['laser_power'] = self.acquisitions[0].get_property(ABLATIONPOWER)
+        # fill structure info
+        _acquisitions = []
+        for q in self.acquisitions:
+            q_name = 'Q{}'.format(str(q.id).zfill(3))
+            _acquisitions.append(q_name)
+        self.meta_summary['acquisitions'] = _acquisitions
+        _panoramas = []
+
for p in self.slides[0].panoramas:
+            panorama = {'id': p.id,
+                        SLIDEX1POSUM: p.get_property(SLIDEX1POSUM),
+                        SLIDEX2POSUM: p.get_property(SLIDEX2POSUM),
+                        SLIDEX3POSUM: p.get_property(SLIDEX3POSUM),
+                        SLIDEX4POSUM: p.get_property(SLIDEX4POSUM),
+                        SLIDEY1POSUM: p.get_property(SLIDEY1POSUM),
+                        SLIDEY2POSUM: p.get_property(SLIDEY2POSUM),
+                        SLIDEY3POSUM: p.get_property(SLIDEY3POSUM),
+                        SLIDEY4POSUM: p.get_property(SLIDEY4POSUM),
+                        PIXELSCALECOEF: p.get_property(PIXELSCALECOEF),
+                        'acquisition_rois': []
+                        }
+            for r in p.acquisitionrois:
+                acquisition_roi = {'id': r.id,
+                                   'acquisitions': []}
+                for q in r.acquisitions:
+                    acquisition = {'id': q.id,
+                                   'channels': []}
+                    for c in q.channels:
+                        # format: marker -> target
+                        acquisition['channels'].append({
+                            'metal': c.get_property(CHANNELNAME),
+                            'target': c.get_property(CHANNELLABEL)})
+                    acquisition_roi['acquisitions'].append(acquisition)
+                panorama['acquisition_rois'].append(acquisition_roi)
+            _panoramas.append(panorama)
+        self.meta_summary['panoramas'] = _panoramas
+
+    def _find_files(self):
+        """Set mcd_fn and txt_fns from input_dir; raise unless exactly one mcd file is found."""
+        if self.input_dir.is_file():
+            if self.input_dir.suffix != '.mcd':
+                raise Exception('Input file does not seem to be a valid mcd file')
+            self.mcd_fn = self.input_dir
+        else:
+            # check if mcd file exists
+            files = list(self.input_dir.glob('*.mcd'))
+            if not files:
+                raise Exception('No mcd file was found in the input folder')
+            elif len(files) > 1:
+                raise Exception('More than one mcd files were found in the input folder')
+            self.mcd_fn = files[0]
+            # text files
+            self.txt_fns = list(self.input_dir.glob('*.txt'))
+
+    def _parse_files(self):
+        try:  # any failure is re-raised with a common message, chained to its cause
+            self._parser = ImcDataParser(self.mcd_fn, textfilenames=self.txt_fns)
+            self._parser.read_mcd_xml()
+            self._parser.parse_mcd_xml()
+        except Exception as e:
+            raise Exception('Error parsing raw files: {}'.format(str(e))) from e
+
+    def _build_object_lists(self):
+        self.slides = list(self._parser.meta.objects[SLIDE].values())
+        self.panoramas =
list(self._parser.meta.objects[PANORAMA].values())
+        self.acquisitions = list(self._parser.meta.objects[ACQUISITION].values())
+        for s in self.slides:
+            s.panoramas = list(s.childs[PANORAMA].values())
+            for p in s.panoramas:
+                if len(p.childs):
+                    p.acquisitionrois = list(p.childs[ACQUISITIONROI].values())
+                    for r in p.acquisitionrois:
+                        r.roipoints = list(r.childs[ROIPOINT].values())
+                        # sort RoiPoints by OrderNumber, numerically (NOTE(review): assumes the property is a numeric string)
+                        r.roipoints.sort(key=lambda x: int(x.get_property(ORDERNUMBER, 0)))
+                        r.acquisitions = list(r.childs[ACQUISITION].values())
+                        # sort Acquisitions by OrderNumber, numerically
+                        r.acquisitions.sort(key=lambda x: int(x.get_property(ORDERNUMBER, 0)))
+                        for q in r.acquisitions:
+                            q.channels = list(q.childs[ACQUISITIONCHANNEL].values())
+                            # sort Channels by OrderNumber, numerically ('10' must not sort before '2')
+                            q.channels.sort(key=lambda x: int(x.get_property(ORDERNUMBER, 0)))
+                            self._assign_acquisition_meta_summary(q, r, p)
+
+    def _assign_acquisition_meta_summary(self, q: Acquisition, r: AcquisitionRoi, p: Panorama):
+        """Fill q.meta_summary from the acquisition q, its ROI r and its panorama p."""
+        q.meta_summary['q_id'] = q.id
+        q.meta_summary['q_num'] = q.get_property(ORDERNUMBER)
+        q.meta_summary['q_timestamp'] = q.get_property(STARTTIMESTAMP)
+        q.meta_summary['q_description'] = q.get_property(DESCRIPTION)
+        q.meta_summary['q_maxx'] = q.maxx
+        q.meta_summary['q_maxy'] = q.maxy
+        q.meta_summary['q_stage_x'] = float(r.roipoints[0].get_property(SLIDEXPOSUM))
+        q.meta_summary['q_stage_y'] = float(r.roipoints[0].get_property(SLIDEYPOSUM))
+        q.meta_summary['q_laser_power'] = q.get_property(ABLATIONPOWER)
+        q.meta_summary['q_resolution_xy'] = 1.0  # always 1 um
+        q.meta_summary['q_n_channels'] = len(q.channels)
+        channel_meta = []
+        for c in q.channels:
+            # format: marker -> target
+            channel_meta.append({
+                'metal': c.get_property(CHANNELNAME),
+                'target': c.get_property(CHANNELLABEL)})
+        q.meta_summary['q_channels'] = channel_meta
+        # meta from panorama
+        q.meta_summary['p_id'] = p.id
+        # meta from acquisition_roi
+        q.meta_summary['r_id'] = r.id
+
+    def get_acquisition_data(self, q:
Acquisition):
+        return self._parser.get_acquisition_data(q)
+
+    def close(self):
+        self._parser.close()
+
+    def save_snapshot_images(self, out_folder):
+        out_folder.mkdir(parents=True, exist_ok=True)
+        for s in self.slides:
+            self._parser.save_snapshot_image(s, out_folder)
+        for p in self.panoramas:
+            self._parser.save_snapshot_image(p, out_folder)
+        for q in self.acquisitions:
+            self._parser.save_snapshot_image(q, out_folder)
+
+    def save_meta_xml(self, out_folder):
+        """Write the raw MCD XML (self.rawmeta) to out_folder/mcd_schema.xml as UTF-8."""
+        fn = out_folder.joinpath("mcd_schema.xml")
+        with open(fn, "w", encoding="utf-8") as f:
+            f.write(self.rawmeta)
diff --git a/imc2zarr/imclib/mcdmeta.py b/imc2zarr/imclib/mcdmeta.py
new file mode 100644
index 0000000..7e0732f
--- /dev/null
+++ b/imc2zarr/imclib/mcdmeta.py
@@ -0,0 +1,213 @@
+from collections import OrderedDict
+import os
+import csv
+
+from .metadefs import *
+
+"""
+This module should help parsing the MCD xml metadata
+"""
+PARSER = "parser"
+META_CSV = "_meta.csv"
+"""
+Definition of all the meta objects
+Each entity will have a class corresponding to it, with helpermethods
+that e.g. allow to retrieve images etc.
+
+This is implemented as parent-child relationships where each entry has a list of parents
+and a nested dictionary of children of the form (child_type: childID: childobject)
+
+Further each object is registered in the global root node, making them easy accessible.
+"""
+
+
+class Meta(object):
+    """
+    Represents an abstract metadata object.
+    """
+
+    def __init__(self, mtype, meta, parents, symbol=None):
+        """
+        Initializes the metadata object, generates the
+        parent-child relationships and updates to object list
+        of the root
+
+        :param mtype: the name of the object type
+        :param meta: the metadata dictionary
+        :param parents: the parents of this object
+        :param symbol: the short symbol for this metadata, e.g.
's' for slide
+
+        """
+        self.mtype = mtype
+        self.id = meta.get(ID, None)
+        # if self.id:
+        #     self.id = int(self.id)
+        self.childs = dict()
+        self.symbol = symbol
+
+        self.properties = meta
+        self.parents = parents
+        for p in parents:
+            self._update_parents(p)
+
+        if self.is_root:
+            self.objects = dict()
+        else:
+            # update the root objects
+            root = self.get_root()
+            self._update_dict(root.objects)
+
+    @property
+    def is_root(self):
+        return len(self.parents) == 0
+
+    def _update_parents(self, p):
+        self._update_dict(p.childs)
+
+    def _update_dict(self, d):
+        mtype = self.mtype
+        mdict = d.get(mtype, None)
+        if mdict is None:
+            mdict = OrderedDict()
+            d[mtype] = mdict
+        mdict.update({self.id: self})
+
+    def get_root(self):
+        """
+        Gets the root node of this metadata
+        tree
+        """
+        if self.is_root:
+            return self
+        else:
+            return self.parents[0].get_root()
+
+    @property
+    def metaname(self):
+        own = self.symbol + self.id  # a root has no parent name to prepend
+        return own if self.is_root else "_".join([self.parents[0].metaname, own])
+
+    def get_property(self, prop, default_val=None):
+        """
+        Return the metadata value stored under ``prop``,
+        or ``default_val`` when the key is absent.
+        """
+        return self.properties.get(prop, default_val)
+
+
+# Definition of the subclasses
+class Slide(Meta):
+    def __init__(self, meta, parents):
+        Meta.__init__(self, SLIDE, meta, parents, "s")
+
+
+class Panorama(Meta):
+    def __init__(self, meta, parents):
+        self.acquisitionrois = []
+        Meta.__init__(self, PANORAMA, meta, parents, "p")
+
+
+class AcquisitionRoi(Meta):
+    def __init__(self, meta, parents):
+        self.roipoints = []
+        Meta.__init__(self, ACQUISITIONROI, meta, parents, "r")
+
+
+class Acquisition(Meta):
+    def __init__(self, meta, parents):
+        self.meta_summary = OrderedDict()
+        self.channels = []
+        Meta.__init__(self, ACQUISITION, meta, parents, "a")
+
+    def get_channels(self):
+        return self.childs[ACQUISITIONCHANNEL]
+
+    def get_channel_orderdict(self):
+        chan_dic = self.get_channels()
+        out_dic = dict()
+        for k, chan in chan_dic.items():
+            channel_name = chan.properties[CHANNELNAME]
+
channel_label = chan.properties.get(CHANNELLABEL, channel_name)
+            channel_order = int(chan.properties.get(ORDERNUMBER))
+            out_dic.update({channel_order: (channel_name, channel_label)})
+        return out_dic
+
+    @property
+    def data_offset_start(self):
+        return int(self.properties[DATASTARTOFFSET])
+
+    @property
+    def data_offset_end(self):
+        return int(self.properties[DATAENDOFFSET])
+
+    @property
+    def data_size(self):
+        return self.data_offset_end - self.data_offset_start
+
+    @property
+    def data_nrows(self):
+        """Number of whole rows (one value per channel) that fit in the data block."""
+        # value_bytes is the absolute VALUEBYTES, so a negative property value
+        # cannot produce a negative row count; integer division avoids floats
+        return self.data_size // (self.n_channels * self.value_bytes)
+
+    @property
+    def maxx(self):
+        return int(self.get_property(MAXX))
+
+    @property
+    def maxy(self):
+        return int(self.get_property(MAXY))
+
+    @property
+    def n_channels(self):
+        return len(self.get_channels())
+
+    @property
+    def value_bytes(self):
+        return abs(int(self.properties[VALUEBYTES]))
+
+    @property
+    def expected_data_size(self):
+        return self.n_channels * self.maxx * self.maxy * self.value_bytes
+
+    @property
+    def expected_data_nrows(self):
+        """Row count implied by expected_data_size (n_channels * maxx * maxy values)."""
+        # same divisor as data_nrows: the absolute VALUEBYTES, in integer
+        # arithmetic
+        return self.expected_data_size // (self.n_channels * self.value_bytes)
+
+
+class RoiPoint(Meta):
+    def __init__(self, meta, parents):
+        Meta.__init__(self, ROIPOINT, meta, parents, "rp")
+
+
+class Channel(Meta):
+    def __init__(self, meta, parents):
+        Meta.__init__(self, ACQUISITIONCHANNEL, meta, parents, "c")
+
+
+# A dictionary to map metadata keys to metadata types
+# The order reflects the dependency structure of them and the
+# order these objects should be initialized
+OBJ_DICT = OrderedDict(
+    [
+        (SLIDE, Slide),
+        (PANORAMA, Panorama),
+        (ACQUISITIONROI, AcquisitionRoi),
+        (ACQUISITION, Acquisition),
+        (ROIPOINT, RoiPoint),
+        (ACQUISITIONCHANNEL, Channel),
+    ]
+)
+
+# A dictionary to map id keys to metadata keys
+# Used for initialization of the objects
+ID_DICT = {
+    SLIDEID: SLIDE,
+    PANORAMAID: PANORAMA,
+    ACQUISITIONROIID: ACQUISITIONROI,
+
ACQUISITIONID: ACQUISITION, +} \ No newline at end of file diff --git a/imc2zarr/imclib/mcdutils.py b/imc2zarr/imclib/mcdutils.py new file mode 100644 index 0000000..0aca191 --- /dev/null +++ b/imc2zarr/imclib/mcdutils.py @@ -0,0 +1,272 @@ +import mmap +import re, os +import binascii +from collections import defaultdict +import array +import pandas as pd +import numpy as np + +from .metadefs import MCDSCHEMA, ID, ORDERNUMBER + + +class McdUtils: + """Static method helpers for parsing MCD file format""" + _start_str="<MCDSchema" + _stop_str="</MCDSchema>" + _meta_length = 100 * 1024 ** 2 + + @staticmethod + def read_mcd_xml_mmap(fh): + """ + Finds the MCD metadata XML in the binary and updates the mcdparser object. + As suggested in the specifications the file is parsed from the end. + + :param fn: + :param start_str: + :param stop_str: + """ + size = os.fstat(fh.fileno()).st_size + length = McdUtils._meta_length if McdUtils._meta_length < size else size + offset = size - length + map_start = offset - offset % mmap.ALLOCATIONGRANULARITY + mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ, offset=map_start) + + start_str = McdUtils._start_str + stop_str = McdUtils._stop_str + + xml_start = mm.rfind(start_str.encode("utf-8")) + + if xml_start == -1: + start_str = McdUtils._add_nullbytes(start_str) + xml_start = mm.rfind(start_str.encode("utf-8")) + + if xml_start == -1: + raise ValueError( + "Invalid MCD: MCD xml start tag not found in file %s" % fh.name + ) + else: + xml_stop = mm.rfind(stop_str.encode("utf-8")) + if xml_stop == -1: + stop_str = McdUtils._add_nullbytes(stop_str) + xml_stop = mm.rfind(stop_str.encode("utf-8")) + # xmls = [mm[start:end] for start, end in zip(xml_starts, xml_stops)] + + if xml_stop == -1: + raise ValueError( + "Invalid MCD: MCD xml stop tag not found in file %s" % fh.name + ) + else: + xml_stop += len(stop_str) + + xml = mm[xml_start:xml_stop].decode("utf-8") + return xml + + @staticmethod + def read_mcd_xml(fh): + """ + Finds the 
MCD metadata XML in the binary. + As suggested in the specifications the file is parsed from the end. + + :param fn: + :param start_str: + :param stop_str: + """ + start_str = McdUtils._start_str + stop_str = McdUtils._stop_str + + xml_start = McdUtils._reverse_find_in_buffer(fh, start_str.encode("utf-8")) + + if xml_start == -1: + start_str = McdUtils._add_nullbytes(start_str) + xml_start = McdUtils._reverse_find_in_buffer(fh, start_str.encode("utf-8")) + + if xml_start == -1: + raise ValueError( + "Invalid MCD: MCD xml start tag not found in file %s" % fh.name + ) + else: + xml_stop = McdUtils._reverse_find_in_buffer(fh, stop_str.encode("utf-8")) + if xml_stop == -1: + stop_str = McdUtils._add_nullbytes(stop_str) + xml_stop = McdUtils._reverse_find_in_buffer(fh, stop_str.encode("utf-8")) + # xmls = [mm[start:end] for start, end in zip(xml_starts, xml_stops)] + + if xml_stop == -1: + raise ValueError( + "Invalid MCD: MCD xml stop tag not found in file %s" % fh.name + ) + else: + xml_stop += len(stop_str) + + fh.seek(xml_start) + xml = fh.read(xml_stop - xml_start).decode("utf-8") + return xml + + @staticmethod + def read_mcd_buffer_mmap(fh, start_offset, length): + mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ) + mm.seek(start_offset) + return mm.read(length) + + @staticmethod + def get_shape_from_acq_data(data): + shape = data[:, :2].max(axis=0) + 1 + # if np.prod(shape) > data.shape[0]: + # shape[1] -= 1 + shape = shape.astype(int) + return shape + + @staticmethod + def valid_txt_file(fh, valid_lines=2): + fh.seek(0) + valid = False + l = 0 + for line in fh: + l += 1 + if l >= valid_lines: + valid = True + break + return valid + + @staticmethod + def read_acquisition_text_data(fh, first_col=3): + fh.seek(0) + header = fh.readline().split("\t") + channel_names = header[first_col:] + nchan = len(channel_names) + rawar = array.array("f") + for raw in fh: + for v in raw.split("\t")[first_col:]: + rawar.append(float(v)) + nrow = int(len(rawar) / nchan) + 
data = np.array([rawar[idx::nchan] for idx in range(nchan)]) + shape = [int(data[0].max()) + 1, int(data[1].max()) + 1] + if np.prod(shape) > nrow: + shape[1] -= 1 + data = data[:, :(np.prod(shape))] + data = np.reshape(data, [nchan, shape[1], shape[0]], order='C') + return data, shape, channel_names + + @staticmethod + def read_acquisition_text_data_2(fh, first_col=3): + fh.seek(0) + header = fh.readline().split("\t") + channel_names = header[first_col:] + nchan = len(channel_names) + data = [] + for c_idx in range(first_col, nchan + first_col): + ch_data = McdUtils._read_channel_text_data_pd(fh, c_idx) + data.append(ch_data) + shape = [int(data[0].max()) + 1, int(data[1].max()) + 1] + if np.prod(shape) > data[0].shape[0]: + shape[1] -= 1 + data = np.hstack(data) + data = data[:, :(np.prod(shape))] + data = np.reshape(data, [nchan, shape[1], shape[0]], order='C') + return data, shape, channel_names + + @staticmethod + def _read_channel_text_data_pd(fh, ch_col): + fh.seek(0) + ch_data = pd.read_table( + fh, + dtype='f', + engine="c", + skiprows=0, + usecols=[ch_col], + ) + return ch_data + + @staticmethod + def _read_channel_text_data_np(fh, ch_col): + fh.seek(0) + ch_data = np.genfromtxt( + fh, + dtype='f', + delimiter='\t', + skip_header=1, + usecols=[ch_col], + ) + return ch_data + + @staticmethod + def _reverse_find_in_buffer(f, s, buffer_size=8192): + """ + Find 's' in buffer of file-handle 'f' + + :return: string with nullbits + """ + # based on http://stackoverflow.com/questions/3893885/cheap-way-to-search-a-large-text-file-for-a-string + f.seek(0, 2) + + buf = None + overlap = len(s) - 1 + bsize = buffer_size + overlap + 1 + cur_pos = f.tell() - bsize + 1 + offset = -2 * bsize + overlap + first_start = True + while cur_pos >= 0: + #print('seeking..') + f.seek(cur_pos) + buf = f.read(bsize) + if buf: + pos = buf.find(s) + if pos >= 0: + return f.tell() - (len(buf) - pos) + + cur_pos = f.tell() + offset + if (cur_pos < 0) and first_start: + first_start = 
False + cur_pos = 0 + return -1 + + @staticmethod + def _add_nullbytes(buffer_str): + """ + Adds nullbytes after each character in a string + + :param buffer_str: + :return: string with nullbits + """ + pad_str = "" + for s in buffer_str: + pad_str += s + "\x00" + return pad_str + + @staticmethod + def _etree_to_dict(t): + """ + converts an etree xml to a dictionary + """ + d = {t.tag: {} if t.attrib else None} + children = list(t) + if children: + dd = defaultdict(list) + for dc in map(McdUtils._etree_to_dict, children): + for k, v in dc.items(): + dd[k].append(v) + d = { + t.tag: { + k: v[0] if (len(v) == 1 and ~isinstance(v[0], type(dict()))) else v + for k, v in dd.items() + } + } + if t.attrib: + d[t.tag].update(("@" + k, v) for k, v in t.attrib.items()) + if t.text: + text = t.text.strip() + if children or t.attrib: + if text: + d[t.tag]["#text"] = text + else: + if t.tag == ID or t.tag == ORDERNUMBER: + d[t.tag] = int(text) + else: + d[t.tag] = text + return d + + @staticmethod + def xml2dict(xml): + dic = McdUtils._etree_to_dict(xml) + dic = dic[MCDSCHEMA] + return dic \ No newline at end of file diff --git a/imc2zarr/imclib/mcdxmlparser.py b/imc2zarr/imclib/mcdxmlparser.py new file mode 100644 index 0000000..afcd6b1 --- /dev/null +++ b/imc2zarr/imclib/mcdxmlparser.py @@ -0,0 +1,114 @@ +import xml.etree.ElementTree as et + +from .mcdmeta import * +from .mcdutils import McdUtils + + +class McdXmlParser(Meta): + """ + Represents the full mcd xml + """ + + def __init__(self, xml, filename=None): + self._rawxml = xml + meta = McdUtils.xml2dict(xml) + Meta.__init__(self, MCDSCHEMA, meta, []) + self._init_objects() + if filename is None: + filename = list(self.childs[SLIDE].values())[0].properties[FILENAME] + self.filename = filename + + @property + def metaname(self): + mcd_fn = self.filename + mcd_fn = mcd_fn.replace("\\", "/") + mcd_fn = os.path.split(mcd_fn)[1].rstrip("_schema.xml") + mcd_fn = os.path.splitext(mcd_fn)[0] + return mcd_fn + + def 
_init_objects(self): + obj_keys = [k for k in OBJ_DICT.keys() if k in self.properties.keys()] + for k in obj_keys: + ObjClass = OBJ_DICT[k] + objs = self._get_meta_objects(k) + idks = [ik for ik in objs[0].keys() if ik in ID_DICT.keys()] + for o in objs: + parents = [self._get_objects_by_id(ik, o[ik]) for ik in idks] + if len(parents) == 0: + parents = [self] + ObjClass(o, parents) + + def _get_objects_by_id(self, idname, objid): + """ + Gets objects by idname and id + :param idname: an name of an id registered in the ID_DICT + :param objid: the id of the object + :returns: the described object. + """ + mtype = ID_DICT[idname] + return self._get_object(mtype, int(objid)) + + def _get_object(self, mtype, mid): + """ + Return an object defined by type and id + :param mtype: object type + :param mid: object id + :returns: the requested object + """ + return self.objects[mtype][mid] + + def _get_meta_objects(self, mtype): + """ + A helper to get objects, e.g. slides etc. metadata + from the metadata dict. takes care of the case where + only one object is present and thus a dict and not a + list of dicts is returned. 
+ """ + objs = self.properties.get(mtype) + if isinstance(objs, type(dict())): + objs = [objs] + return objs + + def save_meta_xml(self, out_folder): + xml = self._rawxml + # fn = self.metaname + "_schema.xml" + fn = "mcd_schema.xml" + et.ElementTree(xml).write( + os.path.join(out_folder, fn), encoding="utf-8" + ) + + def get_channels(self): + """ + gets a list of all channels + """ + raise NotImplementedError + + def get_acquisitions(self): + """ + gets a list of all acquisitions + """ + return self.objects[ACQUISITION] + + def get_acquisition_meta(self, acid): + """ + Returns the acquisition metadata dict + """ + return self._get_object(ACQUISITION, acid).properties + + def get_acquisition_rois(self): + """ + gets a list of all acuisitionROIs + """ + raise NotImplementedError + + def get_panoramas(self): + """ + get a list of all panoramas + """ + raise NotImplementedError + + def get_roipoints(self): + """ + get a list of all roipoints + """ + raise NotImplementedError diff --git a/imc2zarr/imclib/metadefs.py b/imc2zarr/imclib/metadefs.py new file mode 100644 index 0000000..fd91fe4 --- /dev/null +++ b/imc2zarr/imclib/metadefs.py @@ -0,0 +1,71 @@ +IMAGE_START_OFFSET = 161 + +# Definition of all the vocabulary used +ABLATIONDISTANCEBETWEENSHOTSX = "AblationDistanceBetweenShotsX" +ABLATIONDISTANCEBETWEENSHOTSY = "AblationDistanceBetweenShotsY" +ABLATIONFREQUENCY = "AblationFrequency" +ABLATIONPOWER = "AblationPower" +ACQUISITION = "Acquisition" +ACQUISITIONCHANNEL = "AcquisitionChannel" +ACQUISITIONID = "AcquisitionID" +ACQUISITIONROI = "AcquisitionROI" +ACQUISITIONROIID = "AcquisitionROIID" +AFTERABLATIONIMAGEENDOFFSET = "AfterAblationImageEndOffset" +AFTERABLATIONIMAGESTARTOFFSET = "AfterAblationImageStartOffset" +BEFOREABLATIONIMAGEENDOFFSET = "BeforeAblationImageEndOffset" +BEFOREABLATIONIMAGESTARTOFFSET = "BeforeAblationImageStartOffset" +CHANNELLABEL = "ChannelLabel" +CHANNELNAME = "ChannelName" +DATAENDOFFSET = "DataEndOffset" +DATASTARTOFFSET = 
"DataStartOffset" +DESCRIPTION = "Description" +DUALCOUNTSTART = "DualCountStart" +ENDTIMESTAMP = "EndTimeStamp" +FILENAME = "Filename" +HEIGHTUM = "HeightUm" +ID = "ID" +IMAGEENDOFFSET = "ImageEndOffset" +IMAGEFILE = "ImageFile" +IMAGEFORMAT = "ImageFormat" +IMAGESTARTOFFSET = "ImageStartOffset" +MCDSCHEMA = "MCDSchema" +MAXX = "MaxX" +MAXY = "MaxY" +MOVEMENTTYPE = "MovementType" +ORDERNUMBER = "OrderNumber" +PANORAMA = "Panorama" +PANORAMAID = "PanoramaID" +PANORAMAPIXELXPOS = "PanoramaPixelXPos" +PANORAMAPIXELYPOS = "PanoramaPixelYPos" +PIXELHEIGHT = "PixelHeight" +PIXELSCALECOEF = "PixelScaleCoef" +PIXELWIDTH = "PixelWidth" +PLUMEEND = "PlumeEnd" +PLUMESTART = "PlumeStart" +ROIENDXPOSUM = "ROIEndXPosUm" +ROIENDYPOSUM = "ROIEndYPosUm" +ROIPOINT = "ROIPoint" +ROISTARTXPOSUM = "ROIStartXPosUm" +ROISTARTYPOSUM = "ROIStartYPosUm" +ROITYPE = "ROIType" +SEGMENTDATAFORMAT = "SegmentDataFormat" +SIGNALTYPE = "SignalType" +SLIDE = "Slide" +SLIDEID = "SlideID" +SLIDETYPE = "SlideType" +SLIDEX1POSUM = "SlideX1PosUm" +SLIDEX2POSUM = "SlideX2PosUm" +SLIDEX3POSUM = "SlideX3PosUm" +SLIDEX4POSUM = "SlideX4PosUm" +SLIDEXPOSUM = "SlideXPosUm" +SLIDEY1POSUM = "SlideY1PosUm" +SLIDEY2POSUM = "SlideY2PosUm" +SLIDEY3POSUM = "SlideY3PosUm" +SLIDEY4POSUM = "SlideY4PosUm" +SLIDEYPOSUM = "SlideYPosUm" +STARTTIMESTAMP = "StartTimeStamp" +TEMPLATE = "Template" +UID = "UID" +VALUEBYTES = "ValueBytes" +WIDTHUM = "WidthUm" +SWVERSION = "SwVersion" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8fe2f47 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3c01dd2 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,40 @@ +[metadata] +name = imc2zarr +version = attr: imc2zarr.__version__ +author = Mo Alsad and Eduardo Gonzalez Solares +author_email ="msa51@cam.ac.uk +description = IMC 
scan to Zarr conversion +long_description = file: README.md +long_description_content_type = text/markdown +url = https://gitlab.developers.cam.ac.uk/astronomy/camcead/imaxt/imc2zarr +project_urls = + Bug Tracker = https://gitlab.developers.cam.ac.uk/astronomy/camcead/imaxt/imc2zarr/-/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +packages = find: +python_requires = >=3.7 +setup_requires = + setuptools + +install_requires = + click + numpy + pandas + python_dateutil + xarray + zarr + +[options.entry_points] +console_scripts = + imc2zarr = imc2zarr:main + +[flake8] +max-line-length = 110 +select = C,E,F,W,B,B950 +exclude = docs,build,.git,__pycache__ +ignore = E203, E252, E501, W503, W504, B950 +max-complexity = 10 diff --git a/test.py b/test.py new file mode 100644 index 0000000..604aa6d --- /dev/null +++ b/test.py @@ -0,0 +1,4 @@ +from imc2zarr import imc2zarr + +imc2zarr(r'D:\test\imaxt\imc\in\v7', r'D:\test\imaxt\imc\out\test') + -- GitLab From 3dbeb7c4be410e453c00a572b8cd10b43db57dde Mon Sep 17 00:00:00 2001 From: Mo Al Sa'd <msa51@cam.ac.uk> Date: Sun, 19 Jun 2022 23:42:32 +0000 Subject: [PATCH 2/4] Delete test.py --- test.py | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 604aa6d..0000000 --- a/test.py +++ /dev/null @@ -1,4 +0,0 @@ -from imc2zarr import imc2zarr - -imc2zarr(r'D:\test\imaxt\imc\in\v7', r'D:\test\imaxt\imc\out\test') - -- GitLab From a18ec2ddf22a002052cd03ac786f9c3705b80d59 Mon Sep 17 00:00:00 2001 From: Eduardo Gonzalez Solares <eglez@ast.cam.ac.uk> Date: Mon, 20 Jun 2022 13:04:33 +0100 Subject: [PATCH 3/4] Add flake8 testing --- .gitlab-ci.yml | 15 +++++++++++++++ pyproject.toml | 3 +++ setup.cfg | 15 ++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 
100644 index 0000000..4c86f30 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,15 @@ +image: python:latest + +variables: + PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" + +cache: + paths: + - .cache/pip + - .tox + +test: + script: + - pip install tox + - tox + diff --git a/pyproject.toml b/pyproject.toml index 8fe2f47..17f471b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ [build-system] requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" + +[tox] +isolated_build = true diff --git a/setup.cfg b/setup.cfg index 3c01dd2..26c35ea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,6 +35,19 @@ console_scripts = [flake8] max-line-length = 110 select = C,E,F,W,B,B950 -exclude = docs,build,.git,__pycache__ +exclude = docs,build,.git,__pycache__,.tox ignore = E203, E252, E501, W503, W504, B950 max-complexity = 10 + + +[tox:tox] +envlist = py39 +isolated_build = True + +[testenv] +usedevelop = true +install_command = pip install -U {opts} {packages} +deps = + flake8 +commands = + flake8 -- GitLab From dc85f8dd62d7e23ce45d53536320b257b2be5b3e Mon Sep 17 00:00:00 2001 From: Mo Alsad <mo.alsad@gmail.com> Date: Fri, 29 Jul 2022 20:28:15 +0100 Subject: [PATCH 4/4] Fix coding style --- imc2zarr/converter.py | 2 +- imc2zarr/imclib/imcdataparser.py | 38 +++++++-------- imc2zarr/imclib/imcraw.py | 79 +++++++++++++++----------------- imc2zarr/imclib/mcdmeta.py | 64 +++++++++++++------------- imc2zarr/imclib/mcdutils.py | 22 ++++----- imc2zarr/imclib/mcdxmlparser.py | 12 +++-- imc2zarr/imclib/metadefs.py | 2 +- 7 files changed, 108 insertions(+), 111 deletions(-) diff --git a/imc2zarr/converter.py b/imc2zarr/converter.py index 4774cb5..9c07e67 100644 --- a/imc2zarr/converter.py +++ b/imc2zarr/converter.py @@ -61,4 +61,4 @@ class Imc2Zarr: imc.save_meta_xml(self.output_fn) # save snapshots snapshot_dir = self.output_fn.joinpath('snapshots') - imc.save_snapshot_images(snapshot_dir) \ No newline at end of file + imc.save_snapshot_images(snapshot_dir) 
diff --git a/imc2zarr/imclib/imcdataparser.py b/imc2zarr/imclib/imcdataparser.py index 2e348b0..05b6594 100644 --- a/imc2zarr/imclib/imcdataparser.py +++ b/imc2zarr/imclib/imcdataparser.py @@ -1,14 +1,14 @@ -import re, os -from pathlib import Path +import os +import re import xml.etree.ElementTree as et -import time +from pathlib import Path import numpy as np +from .mcdmeta import Slide, Panorama, Acquisition from .mcdutils import McdUtils from .mcdxmlparser import McdXmlParser -from .mcdmeta import Slide, Panorama, Acquisition -from .metadefs import * +import metadefs as defs class ImcDataParser: @@ -62,14 +62,14 @@ class ImcDataParser: try: img = self._read_mcd_acquisition_data(q) mcd_valid = True - except Exception as e: + except Exception: pass # if mcd is invalid try to read data from text if not mcd_valid: try: - img = self._read_txt_acquisition_data(q) + img = self._read_txt_acquisition_data(q) txt_valid = True - except Exception as e: + except Exception: pass # ToDo: if both sources are invalid try to read from mcd using different data size @@ -91,7 +91,7 @@ class ImcDataParser: raise Exception('Invalid acquisition buffer size') buffer = np.memmap( self._mcd_fh, - dtype="<f", # little-endian + dtype="<f", # little-endian mode="r", offset=q.data_offset_start, shape=(int(data_size / q.value_bytes)), @@ -108,7 +108,7 @@ class ImcDataParser: def _read_txt_acquisition_data(self, q: Acquisition): # look for available text files matching the acquisition ID - fn_end = '{}_{}.txt'.format(q.get_property(DESCRIPTION), q.id) + fn_end = '{}_{}.txt'.format(q.get_property(defs.DESCRIPTION), q.id) q.txt_fh = None for fh in self._txt_fhs: if fh.name.endswith(fn_end): @@ -129,19 +129,19 @@ class ImcDataParser: end_offset = [] if isinstance(obj, Slide): fn.append('Slide.jpg') - start_offset.append(int(obj.get_property(IMAGESTARTOFFSET)) + IMAGE_START_OFFSET) - end_offset.append(int(obj.get_property(IMAGEENDOFFSET))) + 
start_offset.append(int(obj.get_property(defs.IMAGESTARTOFFSET)) + defs.IMAGE_START_OFFSET) + end_offset.append(int(obj.get_property(defs.IMAGEENDOFFSET))) elif isinstance(obj, Panorama): fn.append('Panorama_{}.png'.format(obj.id)) - start_offset.append(int(obj.get_property(IMAGESTARTOFFSET)) + IMAGE_START_OFFSET) - end_offset.append(int(obj.get_property(IMAGEENDOFFSET))) + start_offset.append(int(obj.get_property(defs.IMAGESTARTOFFSET)) + defs.IMAGE_START_OFFSET) + end_offset.append(int(obj.get_property(defs.IMAGEENDOFFSET))) elif isinstance(obj, Acquisition): fn.append('Acquisition_{}_Before.png'.format(obj.id)) - start_offset.append(int(obj.get_property(BEFOREABLATIONIMAGESTARTOFFSET)) + IMAGE_START_OFFSET) - end_offset.append(int(obj.get_property(BEFOREABLATIONIMAGEENDOFFSET))) + start_offset.append(int(obj.get_property(defs.BEFOREABLATIONIMAGESTARTOFFSET)) + defs.IMAGE_START_OFFSET) + end_offset.append(int(obj.get_property(defs.BEFOREABLATIONIMAGEENDOFFSET))) fn.append('Acquisition_{}_After.png'.format(obj.id)) - start_offset.append(int(obj.get_property(AFTERABLATIONIMAGESTARTOFFSET)) + IMAGE_START_OFFSET) - end_offset.append(int(obj.get_property(AFTERABLATIONIMAGEENDOFFSET))) + start_offset.append(int(obj.get_property(defs.AFTERABLATIONIMAGESTARTOFFSET)) + defs.IMAGE_START_OFFSET) + end_offset.append(int(obj.get_property(defs.AFTERABLATIONIMAGEENDOFFSET))) if fn: for i in range(len(fn)): data_length = end_offset[i] - start_offset[i] @@ -161,5 +161,5 @@ class ImcDataParser: self._meta_fh.close() for tfh in self._txt_fhs: tfh.close() - except: + except Exception: pass diff --git a/imc2zarr/imclib/imcraw.py b/imc2zarr/imclib/imcraw.py index 1fd7535..bb68b40 100644 --- a/imc2zarr/imclib/imcraw.py +++ b/imc2zarr/imclib/imcraw.py @@ -1,15 +1,12 @@ -import os -from pathlib import Path -import uuid from collections import OrderedDict from dateutil.parser import parse as dateparser from .imcdataparser import ImcDataParser -from .mcdmeta import Slide, Panorama, 
AcquisitionRoi, Acquisition -from .metadefs import * +from .mcdmeta import Panorama, AcquisitionRoi, Acquisition +import metadefs as defs """ - ImcRaw: holds metadata about raw IMC scan and all of its components + ImcRaw: holds metadata about raw IMC scan and all of its components """ @@ -28,19 +25,19 @@ class ImcRaw: def _assign_imc_meta_summary(self): # assign run timestamp from the first acquisition - self.timestamp = dateparser(self.acquisitions[0].get_property(STARTTIMESTAMP)) + self.timestamp = dateparser(self.acquisitions[0].get_property(defs.STARTTIMESTAMP)) # assign code if not self.code: self.code = self.timestamp.strftime('%Y%m%d-%H%M%S-%f') self.rawmeta = self._parser.xml_str.replace("\n", "") - self.mcd_sw_version = self.slides[0].get_property(SWVERSION) + self.mcd_sw_version = self.slides[0].get_property(defs.SWVERSION) # set meta summary self.meta_summary = OrderedDict() - self.meta_summary['description'] = self.slides[0].get_property(DESCRIPTION) + self.meta_summary['description'] = self.slides[0].get_property(defs.DESCRIPTION) self.meta_summary['n_acquisitions'] = len(self.acquisitions) self.meta_summary['mcd_sw_version'] = self.mcd_sw_version - self.meta_summary['run_date'] = self.acquisitions[0].get_property(STARTTIMESTAMP) - self.meta_summary['laser_power'] = self.acquisitions[0].get_property(ABLATIONPOWER) + self.meta_summary['run_date'] = self.acquisitions[0].get_property(defs.STARTTIMESTAMP) + self.meta_summary['laser_power'] = self.acquisitions[0].get_property(defs.ABLATIONPOWER) # fill structure info _acquisitions = [] for q in self.acquisitions: @@ -50,15 +47,15 @@ class ImcRaw: _panoramas = [] for p in self.slides[0].panoramas: panorama = {'id': p.id, - SLIDEX1POSUM: p.get_property(SLIDEX1POSUM), - SLIDEX2POSUM: p.get_property(SLIDEX2POSUM), - SLIDEX3POSUM: p.get_property(SLIDEX3POSUM), - SLIDEX4POSUM: p.get_property(SLIDEX4POSUM), - SLIDEY1POSUM: p.get_property(SLIDEY1POSUM), - SLIDEY2POSUM: p.get_property(SLIDEY2POSUM), - SLIDEY3POSUM: 
p.get_property(SLIDEY3POSUM), - SLIDEY4POSUM: p.get_property(SLIDEY4POSUM), - PIXELSCALECOEF: p.get_property(PIXELSCALECOEF), + defs.SLIDEX1POSUM: p.get_property(defs.SLIDEX1POSUM), + defs.SLIDEX2POSUM: p.get_property(defs.SLIDEX2POSUM), + defs.SLIDEX3POSUM: p.get_property(defs.SLIDEX3POSUM), + defs.SLIDEX4POSUM: p.get_property(defs.SLIDEX4POSUM), + defs.SLIDEY1POSUM: p.get_property(defs.SLIDEY1POSUM), + defs.SLIDEY2POSUM: p.get_property(defs.SLIDEY2POSUM), + defs.SLIDEY3POSUM: p.get_property(defs.SLIDEY3POSUM), + defs.SLIDEY4POSUM: p.get_property(defs.SLIDEY4POSUM), + defs.PIXELSCALECOEF: p.get_property(defs.PIXELSCALECOEF), 'acquisition_rois': [] } for r in p.acquisitionrois: @@ -70,8 +67,8 @@ class ImcRaw: for c in q.channels: # format: marker -> target acquisition['channels'].append({ - 'metal': c.get_property(CHANNELNAME), - 'target': c.get_property(CHANNELLABEL)}) + 'metal': c.get_property(defs.CHANNELNAME), + 'target': c.get_property(defs.CHANNELLABEL)}) acquisition_roi['acquisitions'].append(acquisition) panorama['acquisition_rois'].append(acquisition_roi) _panoramas.append(panorama) @@ -103,46 +100,46 @@ class ImcRaw: raise Exception('Error parsing raw files: {}'.format(str(e))) def _build_object_lists(self): - self.slides = list(self._parser.meta.objects[SLIDE].values()) - self.panoramas = list(self._parser.meta.objects[PANORAMA].values()) - self.acquisitions = list(self._parser.meta.objects[ACQUISITION].values()) + self.slides = list(self._parser.meta.objects[defs.SLIDE].values()) + self.panoramas = list(self._parser.meta.objects[defs.PANORAMA].values()) + self.acquisitions = list(self._parser.meta.objects[defs.ACQUISITION].values()) for s in self.slides: - s.panoramas = list(s.childs[PANORAMA].values()) + s.panoramas = list(s.childs[defs.PANORAMA].values()) for p in s.panoramas: if len(p.childs): - p.acquisitionrois = list(p.childs[ACQUISITIONROI].values()) + p.acquisitionrois = list(p.childs[defs.ACQUISITIONROI].values()) for r in p.acquisitionrois: - 
r.roipoints = list(r.childs[ROIPOINT].values()) + r.roipoints = list(r.childs[defs.ROIPOINT].values()) # sort RoiPoints by OrderNumber - r.roipoints.sort(key=lambda x: x.get_property(ORDERNUMBER)) - r.acquisitions = list(r.childs[ACQUISITION].values()) + r.roipoints.sort(key=lambda x: x.get_property(defs.ORDERNUMBER)) + r.acquisitions = list(r.childs[defs.ACQUISITION].values()) # sort Acquisitions by OrderNumber - r.acquisitions.sort(key=lambda x: x.get_property(ORDERNUMBER)) + r.acquisitions.sort(key=lambda x: x.get_property(defs.ORDERNUMBER)) for q in r.acquisitions: - q.channels = list(q.childs[ACQUISITIONCHANNEL].values()) + q.channels = list(q.childs[defs.ACQUISITIONCHANNEL].values()) # sort Channels by OrderNumber - q.channels.sort(key=lambda x: x.get_property(ORDERNUMBER)) + q.channels.sort(key=lambda x: x.get_property(defs.ORDERNUMBER)) self._assign_acquisition_meta_summary(q, r, p) def _assign_acquisition_meta_summary(self, q: Acquisition, r: AcquisitionRoi, p: Panorama): # meta from acquisition q.meta_summary['q_id'] = q.id - q.meta_summary['q_num'] = q.get_property(ORDERNUMBER) - q.meta_summary['q_timestamp'] = q.get_property(STARTTIMESTAMP) - q.meta_summary['q_description'] = q.get_property(DESCRIPTION) + q.meta_summary['q_num'] = q.get_property(defs.ORDERNUMBER) + q.meta_summary['q_timestamp'] = q.get_property(defs.STARTTIMESTAMP) + q.meta_summary['q_description'] = q.get_property(defs.DESCRIPTION) q.meta_summary['q_maxx'] = q.maxx q.meta_summary['q_maxy'] = q.maxy - q.meta_summary['q_stage_x'] = float(r.roipoints[0].get_property(SLIDEXPOSUM)) - q.meta_summary['q_stage_y'] = float(r.roipoints[0].get_property(SLIDEYPOSUM)) - q.meta_summary['q_laser_power'] = q.get_property(ABLATIONPOWER) + q.meta_summary['q_stage_x'] = float(r.roipoints[0].get_property(defs.SLIDEXPOSUM)) + q.meta_summary['q_stage_y'] = float(r.roipoints[0].get_property(defs.SLIDEYPOSUM)) + q.meta_summary['q_laser_power'] = q.get_property(defs.ABLATIONPOWER) 
q.meta_summary['q_resolution_xy'] = 1.0 # always 1 um q.meta_summary['q_n_channels'] = len(q.channels) channel_meta = [] for c in q.channels: # format: marker -> target channel_meta.append({ - 'metal': c.get_property(CHANNELNAME), - 'target': c.get_property(CHANNELLABEL)}) + 'metal': c.get_property(defs.CHANNELNAME), + 'target': c.get_property(defs.CHANNELLABEL)}) q.meta_summary['q_channels'] = channel_meta # meta from panorama q.meta_summary['p_id'] = p.id diff --git a/imc2zarr/imclib/mcdmeta.py b/imc2zarr/imclib/mcdmeta.py index 7e0732f..f91da08 100644 --- a/imc2zarr/imclib/mcdmeta.py +++ b/imc2zarr/imclib/mcdmeta.py @@ -1,8 +1,6 @@ from collections import OrderedDict -import os -import csv -from .metadefs import * +import metadefs as defs """ This module should help parsing the MCD xml metadata @@ -39,7 +37,7 @@ class Meta(object): """ self.mtype = mtype - self.id = meta.get(ID, None) + self.id = meta.get(defs.ID, None) # if self.id: # self.id = int(self.id) self.childs = dict() @@ -98,66 +96,66 @@ class Meta(object): # Definition of the subclasses class Slide(Meta): def __init__(self, meta, parents): - Meta.__init__(self, SLIDE, meta, parents, "s") + Meta.__init__(self, defs.SLIDE, meta, parents, "s") class Panorama(Meta): def __init__(self, meta, parents): self.acquisitionrois = [] - Meta.__init__(self, PANORAMA, meta, parents, "p") + Meta.__init__(self, defs.PANORAMA, meta, parents, "p") class AcquisitionRoi(Meta): def __init__(self, meta, parents): self.roipoints = [] - Meta.__init__(self, ACQUISITIONROI, meta, parents, "r") + Meta.__init__(self, defs.ACQUISITIONROI, meta, parents, "r") class Acquisition(Meta): def __init__(self, meta, parents): self.meta_summary = OrderedDict() self.channels = [] - Meta.__init__(self, ACQUISITION, meta, parents, "a") + Meta.__init__(self, defs.ACQUISITION, meta, parents, "a") def get_channels(self): - return self.childs[ACQUISITIONCHANNEL] + return self.childs[defs.ACQUISITIONCHANNEL] def get_channel_orderdict(self): 
chan_dic = self.get_channels() out_dic = dict() for k, chan in chan_dic.items(): - channel_name = chan.properties[CHANNELNAME] - channel_label = chan.properties.get(CHANNELLABEL, channel_name) - channel_order = int(chan.properties.get(ORDERNUMBER)) + channel_name = chan.properties[defs.CHANNELNAME] + channel_label = chan.properties.get(defs.CHANNELLABEL, channel_name) + channel_order = int(chan.properties.get(defs.ORDERNUMBER)) out_dic.update({channel_order: (channel_name, channel_label)}) return out_dic @property def data_offset_start(self): - return int(self.properties[DATASTARTOFFSET]) + return int(self.properties[defs.DATASTARTOFFSET]) @property def data_offset_end(self): - return int(self.properties[DATAENDOFFSET]) + return int(self.properties[defs.DATAENDOFFSET]) @property def data_size(self): - return self.data_offset_end - self.data_offset_start #+ 1 + return self.data_offset_end - self.data_offset_start # + 1 @property def data_nrows(self): nrow = int( - self.data_size / (self.n_channels * int(self.properties[VALUEBYTES])) + self.data_size / (self.n_channels * int(self.properties[defs.VALUEBYTES])) ) return nrow @property def maxx(self): - return int(self.get_property(MAXX)) + return int(self.get_property(defs.MAXX)) @property def maxy(self): - return int(self.get_property(MAXY)) + return int(self.get_property(defs.MAXY)) @property def n_channels(self): @@ -165,7 +163,7 @@ class Acquisition(Meta): @property def value_bytes(self): - return abs(int(self.properties[VALUEBYTES])) + return abs(int(self.properties[defs.VALUEBYTES])) @property def expected_data_size(self): @@ -174,19 +172,19 @@ class Acquisition(Meta): @property def expected_data_nrows(self): nrow = int( - self.expected_data_size / (self.n_channels * int(self.properties[VALUEBYTES])) + self.expected_data_size / (self.n_channels * int(self.properties[defs.VALUEBYTES])) ) return nrow class RoiPoint(Meta): def __init__(self, meta, parents): - Meta.__init__(self, ROIPOINT, meta, parents, "rp") + 
Meta.__init__(self, defs.ROIPOINT, meta, parents, "rp") class Channel(Meta): def __init__(self, meta, parents): - Meta.__init__(self, ACQUISITIONCHANNEL, meta, parents, "c") + Meta.__init__(self, defs.ACQUISITIONCHANNEL, meta, parents, "c") # A dictionary to map metadata keys to metadata types @@ -194,20 +192,20 @@ class Channel(Meta): # order these objects should be initialized OBJ_DICT = OrderedDict( [ - (SLIDE, Slide), - (PANORAMA, Panorama), - (ACQUISITIONROI, AcquisitionRoi), - (ACQUISITION, Acquisition), - (ROIPOINT, RoiPoint), - (ACQUISITIONCHANNEL, Channel), + (defs.SLIDE, Slide), + (defs.PANORAMA, Panorama), + (defs.ACQUISITIONROI, AcquisitionRoi), + (defs.ACQUISITION, Acquisition), + (defs.ROIPOINT, RoiPoint), + (defs.ACQUISITIONCHANNEL, Channel), ] ) # A dictionary to map id keys to metadata keys # Used for initializaiton of the objects ID_DICT = { - SLIDEID: SLIDE, - PANORAMAID: PANORAMA, - ACQUISITIONROIID: ACQUISITIONROI, - ACQUISITIONID: ACQUISITION, -} \ No newline at end of file + defs.SLIDEID: defs.SLIDE, + defs.PANORAMAID: defs.PANORAMA, + defs.ACQUISITIONROIID: defs.ACQUISITIONROI, + defs.ACQUISITIONID: defs.ACQUISITION, +} diff --git a/imc2zarr/imclib/mcdutils.py b/imc2zarr/imclib/mcdutils.py index 0aca191..86f13d5 100644 --- a/imc2zarr/imclib/mcdutils.py +++ b/imc2zarr/imclib/mcdutils.py @@ -1,18 +1,18 @@ +import array import mmap -import re, os -import binascii +import os from collections import defaultdict -import array -import pandas as pd + import numpy as np +import pandas as pd from .metadefs import MCDSCHEMA, ID, ORDERNUMBER class McdUtils: """Static method helpers for parsing MCD file format""" - _start_str="<MCDSchema" - _stop_str="</MCDSchema>" + _start_str = "<MCDSchema" + _stop_str = "</MCDSchema>" _meta_length = 100 * 1024 ** 2 @staticmethod @@ -120,10 +120,10 @@ class McdUtils: def valid_txt_file(fh, valid_lines=2): fh.seek(0) valid = False - l = 0 + num_lines = 0 for line in fh: - l += 1 - if l >= valid_lines: + num_lines += 1 + 
if num_lines >= valid_lines: valid = True break return valid @@ -206,7 +206,7 @@ class McdUtils: offset = -2 * bsize + overlap first_start = True while cur_pos >= 0: - #print('seeking..') + # print('seeking..') f.seek(cur_pos) buf = f.read(bsize) if buf: @@ -269,4 +269,4 @@ class McdUtils: def xml2dict(xml): dic = McdUtils._etree_to_dict(xml) dic = dic[MCDSCHEMA] - return dic \ No newline at end of file + return dic diff --git a/imc2zarr/imclib/mcdxmlparser.py b/imc2zarr/imclib/mcdxmlparser.py index afcd6b1..0bbeda7 100644 --- a/imc2zarr/imclib/mcdxmlparser.py +++ b/imc2zarr/imclib/mcdxmlparser.py @@ -1,6 +1,8 @@ +import os import xml.etree.ElementTree as et -from .mcdmeta import * +import metadefs as defs +from .mcdmeta import Meta, OBJ_DICT, ID_DICT from .mcdutils import McdUtils @@ -12,10 +14,10 @@ class McdXmlParser(Meta): def __init__(self, xml, filename=None): self._rawxml = xml meta = McdUtils.xml2dict(xml) - Meta.__init__(self, MCDSCHEMA, meta, []) + Meta.__init__(self, defs.MCDSCHEMA, meta, []) self._init_objects() if filename is None: - filename = list(self.childs[SLIDE].values())[0].properties[FILENAME] + filename = list(self.childs[defs.SLIDE].values())[0].properties[defs.FILENAME] self.filename = filename @property @@ -87,13 +89,13 @@ class McdXmlParser(Meta): """ gets a list of all acquisitions """ - return self.objects[ACQUISITION] + return self.objects[defs.ACQUISITION] def get_acquisition_meta(self, acid): """ Returns the acquisition metadata dict """ - return self._get_object(ACQUISITION, acid).properties + return self._get_object(defs.ACQUISITION, acid).properties def get_acquisition_rois(self): """ diff --git a/imc2zarr/imclib/metadefs.py b/imc2zarr/imclib/metadefs.py index fd91fe4..d69ab03 100644 --- a/imc2zarr/imclib/metadefs.py +++ b/imc2zarr/imclib/metadefs.py @@ -68,4 +68,4 @@ TEMPLATE = "Template" UID = "UID" VALUEBYTES = "ValueBytes" WIDTHUM = "WidthUm" -SWVERSION = "SwVersion" \ No newline at end of file +SWVERSION = "SwVersion" -- GitLab