Source code for intake_astro.table

from collections import OrderedDict
import copy
from intake.source.base import DataSource, Schema
from . import __version__

class FITSTableSource(DataSource):
    """Read FITS tabular data into dataframes

    For one or more FITS files, which can be local or remote, with support
    for partitioning within files.

    Parameters
    ----------
    url: str or list of str
        files to load. Can include protocol specifiers and/or glob
        characters
    ext: str or int
        Extension to load. Normally 0 or 1.
    chunksize: int or None
        For partitioning within files, use this many rows per partition.
        This is very inefficient for compressed files, and for remote files,
        will require at least touching each file to discover the number of
        rows, before even starting to read the data. Cannot be used with
        FITS tables with a "heap", i.e., containing variable-length arrays.
    storage_options: dict or None
        Additional keyword arguments to pass to the storage back-end.
    metadata:
        Arbitrary information to associate with this source.

    After reading the schema, the source will have attributes:
    ``header`` - the full FITS header of one of the files as a dict,
    ``dtype`` - a numpy-like list of field/dtype string pairs,
    ``shape`` - where the number of rows will only be known if using
    partitioning or for a single file input.
    """
    name = 'fits_table'
    container = 'dataframe'
    version = __version__
    partition_access = True

    def __init__(self, url, ext=0, chunksize=None, storage_options=None,
                 metadata=None):
        # Pass metadata by keyword so it cannot be bound to a different
        # leading positional parameter of the base-class constructor.
        super(FITSTableSource, self).__init__(metadata=metadata)
        self.url = url
        self.ext = ext
        self.chunks = chunksize
        self.storage_options = storage_options or {}
        # Both are populated lazily by _get_schema().
        self.df = None
        self.files = None

    def _get_schema(self):
        """Open the files, build the lazy dask dataframe and its schema.

        The dataframe is only built when ``self.df`` is None; otherwise the
        previously stored schema is returned.

        Raises
        ------
        ValueError
            If ``url`` does not resolve to any file.
        """
        from dask.bytes import open_files
        import dask.dataframe as dd
        from dask.base import tokenize
        import dask

        if self.df is None:
            self.files = open_files(self.url, **self.storage_options)
            if not self.files:
                # Without this guard the loop below never runs and the
                # header/dtype/shape names would be unbound further down.
                raise ValueError(
                    'No files found for url: %r' % (self.url,))

            name = 'fits-table-' + tokenize(self.url, self.chunks, self.ext)
            dpart = dask.delayed(_get_fits_section)
            parts = []
            header = dtype = shape = None
            length = 0
            for part in self.files:
                if self.chunks:
                    # Every file is touched here, because its row count is
                    # needed to compute the partition boundaries.
                    header, dtype, shape = _get_fits_header(part, self.ext)
                    nrows = shape[0]
                    for start in range(0, nrows, self.chunks):
                        section = (start, min(start + self.chunks, nrows))
                        parts.append(dpart(part, self.ext, section))
                    length += nrows
                else:
                    if dtype is None:
                        # Only the first file is inspected for the schema.
                        header, dtype, shape = _get_fits_header(
                            part, self.ext)
                    if len(self.files) == 1:
                        # if not sectioning, we don't try to find the total
                        # number of rows, so we only know this for exactly
                        # one file
                        length = shape[0]
                    parts.append(dpart(part, self.ext, None))

            self.header, self.dtype, self.shape = header, dtype, (
                (length or None), shape[1])
            self.df = dd.from_delayed(parts, prefix=name, meta=dtype)
            self._schema = Schema(
                dtype=self.dtype,
                shape=self.shape,
                extra_metadata=self.header,
                npartitions=self.df.npartitions
            )
        return self._schema

    def to_dask(self):
        """Return the lazy dask dataframe, building it first if needed."""
        self._get_schema()
        return self.df

    def read_chunked(self):
        """Return the same lazy dask dataframe as :meth:`to_dask`."""
        return self.to_dask()

    def read_partition(self, i):
        """Compute and return partition number ``i`` of the dataframe."""
        self._get_schema()
        return self.df.get_partition(i).compute()

    def read(self):
        """Compute and return the whole dataframe."""
        self._get_schema()
        return self.df.compute()

    def _close(self):
        """Drop the cached dataframe and file list."""
        self.df = None
        self.files = None
def _get_fits_section(fn, ext=0, section=None):
    """Load one partition of a FITS table as a pandas dataframe.

    Parameters
    ----------
    fn: file-like context manager
        A shallow copy is taken and entered to obtain the open file.
        Presumably a dask ``OpenFile`` - confirm against the caller.
    ext: str or int
        Extension (HDU) to read.
    section: None or (start, end)
        Row range to read, ``end`` exclusive. If None, the whole table is
        read via ``astropy.table.Table``.

    Raises
    ------
    ValueError
        If a row section is requested from a table that has a heap.
    """
    import numpy as np
    import pandas as pd

    with copy.copy(fn) as f:
        if section is None:
            from astropy.table import Table
            t = Table.read(f, hdu=ext, format='fits')
            return t.to_pandas()
        else:
            start, end = section
            # Aliased so the builtin ``open`` is not shadowed.
            from astropy.io.fits import open as fits_open
            from astropy.io.fits.column import _FormatP, _FormatQ

            # copied from hdu._get_tbdata()
            hdus = fits_open(f, memmap=False, cache=False)
            hdu = hdus[ext]
            if (any(type(r) in (_FormatP, _FormatQ)
                    for r in hdu.columns._recformats) and
                    hdu._data_size is not None and
                    hdu._data_size > hdu._theap):
                raise ValueError('Can only read sections from tables without '
                                 'heap; use section=None')
            # Skip ``start`` fixed-size rows from the beginning of the data.
            data_offset = hdu._data_offset + hdu.columns.dtype.itemsize * start
            raw_data = hdu._get_raw_data((end - start), hdu.columns.dtype,
                                         data_offset)
            data = raw_data.view(np.rec.recarray)
            hdu._init_tbdata(data)
            data = data.view(hdu._data_type)
            data._coldefs = hdu.columns
            data._character_as_bytes = hdu._character_as_bytes

            # byteswapping by copy
            out = np.empty(len(data), dtype=_dtypes(hdu))
            for col in out.dtype.fields:
                out[col][:] = data[col]
            df = pd.DataFrame(out)
            return df


def _get_fits_header(fn, ext=0):
    """Return ``(header, dtype, shape)`` for one extension of a FITS file.

    ``header`` is the extension header as a dict, ``dtype`` is the list
    produced by ``_dtypes`` and ``shape`` is ``(number of rows, number of
    columns)``.
    """
    with copy.copy(fn) as f:
        # Aliased so the builtin ``open`` is not shadowed.
        from astropy.io.fits import open as fits_open
        hdu = fits_open(f, memmap=False, cache=False)[ext]
        return dict(hdu.header.items()), _dtypes(hdu), (
            hdu._nrows, len(hdu.columns))


def _dtypes(hdu):
    """Build a list of ``(field, dtype string)`` pairs for a table HDU.

    The column dtype is first converted to native byte order; two kinds of
    column are then special-cased, see the comments below.
    """
    dt = hdu.columns.dtype.newbyteorder('=').descr
    dt2 = []
    for i, (field, d) in enumerate(dt):
        # TODO: may have other special types, perhaps in special function
        if hdu.header['TFORM%i' % (i + 1)] == "L":
            # TFORMn keywords are 1-based; "L" columns become booleans.
            dt2.append((field, '|b1'))
        elif hdu.columns[field].bzero == 32768:
            # convention for unsigned type
            dt2.append((field, d.replace('i', 'u')))
        else:
            dt2.append((field, d))
    return dt2