45P Spectroscopic Preprocessing

2. 45P Spectroscopic Preprocessing#

This is a note demonstrating the preprocessing of longslit spectroscopy data for 45P. The resulting preprocessed data will be used for a spectroscopic example lecture note in Python Example of Longslit Spectroscopy.

Note

I emphasize again that this lecture note does not teach preprocessing. If preprocessing-related lecture is required, I recommend Matthew Craig’s CCD Data Reduction Guide.

# %matplotlib notebook
%config InlineBackend.figure_format = 'retina'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr' 
import numpy as np
from astropy.nddata import CCDData
from matplotlib import pyplot as plt
from astropy.stats import sigma_clipped_stats

# We need to do it in a separate cell. See:
# https://github.com/jupyter/notebook/issues/3385
plt.style.use('default')
plt.rcParams.update({
    'font.family': 'latin modern math', 'font.size':12, 'mathtext.fontset':'stix', 
    'axes.formatter.use_mathtext': True, 'axes.formatter.limits': (-4, 4),
    'axes.grid': True, 'grid.color': 'gray', 'grid.linewidth': 0.5, 
    'xtick.top': True, 'ytick.right': True, 
    'xtick.direction': 'inout', 'ytick.direction': 'inout',
    'xtick.minor.size': 2.0, 'ytick.minor.size': 2.0,  # default 2.0
    'xtick.major.size': 4.0, 'ytick.major.size': 4.0,  # default 3.5
    'xtick.minor.visible': True, 'ytick.minor.visible': True
})
import ysfitsutilpy as yfu
from pathlib import Path

import warnings
from astropy.utils.exceptions import AstropyWarning
warnings.filterwarnings('ignore', append=True, category=AstropyWarning)

DATAPATH = Path('../../Tutorial_Data/45P/rawdata')
CALPATH = Path('../../Tutorial_Data/45P/calib')
REDPATH = Path('../../Tutorial_Data/45P/reduced')
CALPATH.mkdir(exist_ok=True, parents=True)
REDPATH.mkdir(exist_ok=True, parents=True)

USEFUL_KEYWORDS = ["FILE_ID", "DATE-OBS", "EXPTIME", "OBJECT", "IMAGETYP", "AIRMASS", 
                   "ALTITUDE", "AZIMUTH", "GRATING", "ORDERCUT", "SLIT-WID", ]

TRIMSEC = '[:, 250:1300]'  # for 45P

summary = yfu.make_summary(DATAPATH/"**.fits", verify_fix=True, keywords=USEFUL_KEYWORDS, verbose=0)
summary

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[1], line 41
USEFUL_KEYWORDS = ["FILE_ID", "DATE-OBS", "EXPTIME", "OBJECT", "IMAGETYP", "AIRMASS", 
                  "ALTITUDE", "AZIMUTH", "GRATING", "ORDERCUT", "SLIT-WID", ]
TRIMSEC = '[:, 250:1300]'  # for 45P
---> 41 summary = yfu.make_summary(DATAPATH/"**.fits", verify_fix=True, keywords=USEFUL_KEYWORDS, verbose=0)
summary

File ~/Dropbox/github/ysfitsutilpy/ysfitsutilpy/filemgmt.py:286, in make_summary(inputs, extension, verify_fix, fname_option, output, keywords, example_header, sort_by, sort_map, fullmatch, flags, querystr, negate_fullmatch, verbose)
# Run through all the fits files
for i, item in enumerate(fitslist):
--> 286     fname, fsize, hdr = _get_fname_fsize_hdr(item, i, extension=extension)
   summarytab["file"].append(fname)
   summarytab["filesize"].append(fsize)

File ~/Dropbox/github/ysfitsutilpy/ysfitsutilpy/filemgmt.py:233, in make_summary.<locals>._get_fname_fsize_hdr(item, idx, extension)
fsize = Path(item).stat().st_size
# Don't change to MB/GB, which will make it float...
--> 233 hdul = fits.open(item)
if verify_fix:
   hdul.verify('fix')

File ~/mambaforge/lib/python3.10/site-packages/astropy/io/fits/hdu/hdulist.py:213, in fitsopen(name, mode, memmap, save_backup, cache, lazy_load_hdus, ignore_missing_simple, use_fsspec, fsspec_kwargs, **kwargs)
if not name:
   raise ValueError(f"Empty filename: {name!r}")
--> 213 return HDUList.fromfile(
   name,
   mode,
   memmap,
   save_backup,
   cache,
   lazy_load_hdus,
   ignore_missing_simple,
   use_fsspec=use_fsspec,
   fsspec_kwargs=fsspec_kwargs,
   **kwargs,
)

File ~/mambaforge/lib/python3.10/site-packages/astropy/io/fits/hdu/hdulist.py:476, in HDUList.fromfile(cls, fileobj, mode, memmap, save_backup, cache, lazy_load_hdus, ignore_missing_simple, **kwargs)
@classmethod
def fromfile(
   cls,
   (...)
   **kwargs,
):
   """
   Creates an `HDUList` instance from a file-like object.

   (...)
   documentation for details of the parameters accepted by this method).
   """
--> 476     return cls._readfrom(
       fileobj=fileobj,
       mode=mode,
       memmap=memmap,
       save_backup=save_backup,
       cache=cache,
       ignore_missing_simple=ignore_missing_simple,
       lazy_load_hdus=lazy_load_hdus,
       **kwargs,
   )

File ~/mambaforge/lib/python3.10/site-packages/astropy/io/fits/hdu/hdulist.py:1224, in HDUList._readfrom(cls, fileobj, data, mode, memmap, cache, lazy_load_hdus, ignore_missing_simple, use_fsspec, fsspec_kwargs, **kwargs)
   if hdulist._file.close_on_error:
       hdulist._file.close()
-> 1224     raise OSError("Empty or corrupt FITS file")
if not lazy_load_hdus or kwargs.get("checksum") is True:
   # Go ahead and load all HDUs
   while hdulist._read_next_hdu():

OSError: Empty or corrupt FITS file

Example

Look into the header of any file. What is the keyword for the start of the exposure in UT? Is it DATE-OBS or DATE?
Look into the data of a flat file. You can see a vast area of each file is useless (no light). Thus, cropping out central region, you can save nearly half the disk storage! (see TRIMSEC in our code)

Now look at the observation sequence:

fig, axs = plt.subplots(1, 1, figsize=(7, 4))
ax2 = axs.twinx()
axs.grid(False, axis="y")
ax2.grid(False)
axs.plot(summary["FILE_ID"], summary["OBJECT"], "b+:")
ax2.plot(summary["FILE_ID"], summary["EXPTIME"], "r+")
axs.set(xlabel="File ID", ylabel="Object", title="Obs. Summary")
ax2.set(yscale="log")
for ax, lab, color in zip([axs, ax2], ['OBJECT', 'EXPTIME [sec]'], ['b', 'r']):
    ax.set_ylabel(lab, color=color)
    ax.tick_params(axis='y', which='both', color=color, labelcolor=color)

plt.tight_layout()
plt.show();

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 5
      3 axs.grid(False, axis="y")
      4 ax2.grid(False)
----> 5 axs.plot(summary["FILE_ID"], summary["OBJECT"], "b+:")
      6 ax2.plot(summary["FILE_ID"], summary["EXPTIME"], "r+")
      7 axs.set(xlabel="File ID", ylabel="Object", title="Obs. Summary")

NameError: name 'summary' is not defined

../_images/8942e8f6d219612cea8f82b6d788e48246d22aad6aa62c410e9f3ee93afeed39.png

It is composed of

Star (HD 129184), 5 exposures (exposure time varied)
Comparison lamp, 6 exposures
Flat lamp, 10 exposures
Target (45P), 3 exposures (exposure time varied)
Star (HD 129184), 2 exposures
Dark frames with varying exposure times.

Sometimes, when the accuracy of the λ matters (e.g., redshift measurements), a comparison image is taken multiple times before or after taking the target images. This is because the µm-scale distortion of the instrument due to gravity, wind, thermal expansion, etc. during the observation can change the λ(x, y) function. One pixel difference can make a huge error in, e.g., the redshift. In this observation, that’s not the case, so I can simply merge (combine) all the six comparison lamp images to make the “master comparison lamp” image. Similarly, I can make the master flat and dark.

If you have looked into the raw comparison lamp data, you may notice the images change over exposure. However, if you find the peak position of each emission line, you will notice the peak position is relatively stable within ≲ 0.5 pixel. Thus, it is justified to use all comparison lamp images together.

2.1. Master Dark#

# Master dark
_medcomb_kw = dict(combine="median", reject="sc", output_verify="fix", overwrite=True, trimsec=TRIMSEC)

summary_dark = summary.loc[summary["OBJECT"] == "DARK"].copy()
mdarks = {}
for exptime in summary_dark["EXPTIME"].unique():
    _s = summary_dark.loc[summary_dark["EXPTIME"] == exptime].copy()
    mdarks[exptime] = yfu.imcombine(
        _s["file"].tolist(), **_medcomb_kw, output=CALPATH/f"mdark_{exptime:.1f}.fits"
    )

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 4
      1 # Master dark
      2 _medcomb_kw = dict(combine="median", reject="sc", output_verify="fix", overwrite=True, trimsec=TRIMSEC)
----> 4 summary_dark = summary.loc[summary["OBJECT"] == "DARK"].copy()
      5 mdarks = {}
      6 for exptime in summary_dark["EXPTIME"].unique():

NameError: name 'summary' is not defined

2.2. Important Note on Dark#

Unfortunately, we do not have bias or dark with EXPTIME=1200.

Also, if you look at EXPTIME=600 dark images (especially the combined image), you see a clear sign of a light leak. Hence, using 600s dark may be problematic (depending on the cause of the light leak).

Therefore, we cannot scale dark images nor subtract 1200s dark from “45P” images with that exposure. How should we subtract dark properly in this case? This is a possible workaround:

First, using the raw images of the shortest exposure (0.5s, because there is no real bias image with EXPTIME==0), get a rough estimate of the readout noise in ADU. Call it rdn_adu, which is roughly the standard deviation of the pixels in a frame: rdn_adu \(\approx\) std(dark_0.5s).

We will have 10 such estimates. Take the median of them and use it as the rdn_adu.

A pixel in any master dark frame will have an uncertainty (standard error) of at least rdn_adu\(/\sqrt{n}\)

\(n = 10\), is the number of frames combined.

If diff = mdark_20s - mdark_0.5s, each pixel will have roughly \(2\times\)rdn_adu\(/\sqrt{n}\) uncertainty. Then we can make a “hot pixel mask” by using mask = diff > k*sigma (k-sigma), where sigma = 2*rdn_adu/√10 (note, we have to multiply 2 here).
For these pixels, dark_adups = diff[mask]/19.5 will be the “dark ADU/sec” map.
For any exposure time, mdark_0.5s + dark_adups*EXPTIME will be a possible dark map.

dark1d = mdarks[20.].data.flatten()
ntop = 10
idxs = np.argpartition(dark1d, -ntop)[-ntop:]

topvals = {}
for _, row in summary_dark.iterrows():
    _topvals = (yfu.imslice(CCDData.read(row["file"], unit="adu"), trimsec=TRIMSEC).data.flatten())[idxs]
    topvals.setdefault(row["EXPTIME"], []).append(_topvals)

for key in topvals:
    topvals[key] = np.array(topvals[key])
    
fig, axs = plt.subplots(1, 1, figsize=(7, 4))

for idx in range(ntop):
    zero = np.median(topvals[0.5][:, idx])
    for exptime in topvals.keys():
        vals = topvals[exptime][:, idx]
        axs.errorbar(exptime, (np.median(vals) - zero)/(exptime-0.5), yerr=np.std(vals), color=f"C{idx}", 
                     marker="x", ls="none", capsize=3)
        
axs.set(xscale="log", yscale="log", xlabel="Exposure Time [sec]", ylabel="Counts")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 dark1d = mdarks[20.].data.flatten()
      2 ntop = 10
      3 idxs = np.argpartition(dark1d, -ntop)[-ntop:]

NameError: name 'mdarks' is not defined

topvals_meds

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 topvals_meds

NameError: name 'topvals_meds' is not defined

topvals[exptime][:, idx]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 topvals[exptime][:, idx]

NameError: name 'topvals' is not defined

import pandas as pd
pd.DataFrame.from_dict(topvals)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 2
      1 import pandas as pd
----> 2 pd.DataFrame.from_dict(topvals)

NameError: name 'topvals' is not defined

yfu.imslice?

fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)

axs.plot(dark1d)
[axs.axvline(i, color="k", alpha=0.5) for i in idxs]

raveled = np.zeros_like(dark1d)
for i in range(66, 70):
    ccd = yfu.imslice(CCDData.read(summary_dark.loc[i]["file"], unit="adu"), trimsec=TRIMSEC)
    raveled += ccd.data.ravel()
axs.plot(raveled/4, "r", lw=1, alpha=0.8)

plt.tight_layout()
plt.show();

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 3
      1 fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)
----> 3 axs.plot(dark1d)
      4 [axs.axvline(i, color="k", alpha=0.5) for i in idxs]
      6 raveled = np.zeros_like(dark1d)

NameError: name 'dark1d' is not defined

../_images/9b3bd5b9c542c96aa9778b950d922712a26285953564e16bcbae2e822a57870d.png

raveled

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 raveled

NameError: name 'raveled' is not defined

# rough read noise in ADU:
rdn_adus = []
for _, row in summary_dark.iterrows():
    if row["EXPTIME"] == 0.5:
        rdn_adus.append(sigma_clipped_stats(CCDData.read(row["file"], unit="adu"))[2])
    
rdn_adu = np.median(rdn_adus)

diff = mdarks[20.].data - mdarks[0.5].data
sigma = 2*rdn_adu/np.sqrt(len(rdn_adus))
mask = diff > 5*sigma
dark_adups = np.zeros_like(diff)
dark_adups[mask] = diff[mask]/19.5
dark_adups = CCDData(dark_adups, header=mdarks[20.].header, unit="adu")
dark_adups.header["EXPTIME"] = 1
dark_adups.write(CALPATH/"dark_adups.fits", overwrite=True, output_verify="fix")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 3
      1 # rough read noise in ADU:
      2 rdn_adus = []
----> 3 for _, row in summary_dark.iterrows():
      4     if row["EXPTIME"] == 0.5:
      5         rdn_adus.append(sigma_clipped_stats(CCDData.read(row["file"], unit="adu"))[2])

NameError: name 'summary_dark' is not defined

From now on, we can use

0.5-sec master dark as a master bias image.
dark_adups as the master dark (with proper scaling with exposure time).

2.3. Master Flat#

# Master flat
#   NOTE: all flat files have EXPTIME 1 sec 
ccdred_kw = dict(mbias=mdarks[0.5], mbiaspath=CALPATH/"mdark_0.5.fits",
                 mdark=dark_adups, mdarkpath=CALPATH/"dark_adups.fits",
                 dark_scale=True, output_verify="fix", overwrite=True)
summary_flat = summary.loc[summary["OBJECT"] == "flat"]["file"]
mflat = yfu.imcombine(summary_flat.tolist(), **_medcomb_kw)
mflat = yfu.ccdred(mflat, **ccdred_kw)
normval = np.mean(mflat.data)
mflat.data /= normval
mflat.header.add_history("Normalized by mean")
mflat.header["NORMALIZ"] = ("mean", "Normalized by median")
mflat.header["NORMVAL"] = (normval, "Value used for the normalization")
mflat.write(CALPATH/"mflat.fits", overwrite=True, output_verify="fix")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 3
      1 # Master flat
      2 #   NOTE: all flat files have EXPTIME 1 sec 
----> 3 ccdred_kw = dict(mbias=mdarks[0.5], mbiaspath=CALPATH/"mdark_0.5.fits",
      4                  mdark=dark_adups, mdarkpath=CALPATH/"dark_adups.fits",
      5                  dark_scale=True, output_verify="fix", overwrite=True)
      6 summary_flat = summary.loc[summary["OBJECT"] == "flat"]["file"]
      7 mflat = yfu.imcombine(summary_flat.tolist(), **_medcomb_kw)

NameError: name 'mdarks' is not defined

2.4. Preprocess#

from astropy.stats import sigma_clipped_stats

# rough read noise in ADU:
rdn_adu = sigma_clipped_stats(mdarks[0.5], sigma=3.0, maxiters=5)[2]  

for _, row in summary.iterrows():
    if row["OBJECT"] not in ["DARK", "flat", "comparison"]:
        fpath = Path(row["file"])
        ccd = yfu.imslice(CCDData.read(fpath, unit="adu"), trimsec=TRIMSEC)
        nccd = yfu.ccdred(
            ccd, **ccdred_kw,
            mflat=mflat, mflatpath=CALPATH/"mflat.fits", 
            flat_mask=1.e-2, flat_fill=1.e-2,
            output=REDPATH/f"{fpath.name}", verbose_bdf=0
        )

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 4
      1 from astropy.stats import sigma_clipped_stats
      3 # rough read noise in ADU:
----> 4 rdn_adu = sigma_clipped_stats(mdarks[0.5], sigma=3.0, maxiters=5)[2]  
      6 for _, row in summary.iterrows():
      7     if row["OBJECT"] not in ["DARK", "flat", "comparison"]:

NameError: name 'mdarks' is not defined

from astropy.stats import sigma_clipped_stats

ccd = CCDData.read(summary["file"].iloc[10], unit="adu")
ccd = yfu.ccdred(
    yfu.imslice(ccd, trimsec=TRIMSEC), mdark=mdark, mdarkpath=mdarkpath,
    mflat=mflat, mflatpath=CALPATH/"mflat.fits", 
    flat_mask=1.e-2, flat_fill=1,
    output=REDPATH/f"{fpath.name}", verbose_bdf=0, overwrite=True
)

fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)

#axs[0].
# axs.imshow(ccd.data[:, 2:50])
# axs.imshow(ccd.data[:, -50:-2])
axs.plot(sigma_clipped_stats(ccd.data[:, 2:50], sigma=3, maxiters=5, axis=1)[1])
axs.plot(sigma_clipped_stats(ccd.data[:, -50:-2], sigma=3, maxiters=5, axis=1)[1])
# axs.plot(sigma_clipped_stats(ccd.data[-50:-2, :]+10, sigma=3, maxiters=5, axis=0)[1])
# axs.plot(sigma_clipped_stats(ccd.data[2:50, :]+10, sigma=3, maxiters=5, axis=0)[1])

axs.set_aspect('auto')


plt.tight_layout()
plt.show();

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 3
      1 from astropy.stats import sigma_clipped_stats
----> 3 ccd = CCDData.read(summary["file"].iloc[10], unit="adu")
      4 ccd = yfu.ccdred(
      5     yfu.imslice(ccd, trimsec=TRIMSEC), mdark=mdark, mdarkpath=mdarkpath,
      6     mflat=mflat, mflatpath=CALPATH/"mflat.fits", 
      7     flat_mask=1.e-2, flat_fill=1,
      8     output=REDPATH/f"{fpath.name}", verbose_bdf=0, overwrite=True
      9 )
     11 fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)

NameError: name 'summary' is not defined

import _tool_visualization as vis

def mkdark(dark, bias):
    dark = dark.data - bias
    dark[dark < 0] = 0
    return dark

bias = mdarks[0.5].data
mdark600 = mkdark(mdarks[600.], bias)
mdark20 = mkdark(mdarks[20.], bias)
mdark5 = mkdark(mdarks[5.], bias)

nodarkmask = (mdark20 < 10) | (mdark5 < 10)
ratio = mdark20/mdark5
ratio[nodarkmask] = 0

yfu.write2fits(mdark5, header=mdarks[5.].header, output="test.fits", output_verify='fix', overwrite=True)
yfu.write2fits(mdark20, header=mdarks[20.].header, output="test.fits", output_verify='fix', overwrite=True)
yfu.write2fits(ratio, header=mdarks[20.].header, output="test.fits", output_verify='fix', overwrite=True)

yfu.write2fits(mdarks[20.] - mdarks[0.5].data, header=mdarks[20.].header, output="test.fits", output_verify='fix', overwrite=True)


fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)

vis.norm_imshow(axs, ratio)

plt.tight_layout()
plt.show();

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 8
      5     dark[dark < 0] = 0
      6     return dark
----> 8 bias = mdarks[0.5].data
      9 mdark600 = mkdark(mdarks[600.], bias)
     10 mdark20 = mkdark(mdarks[20.], bias)

NameError: name 'mdarks' is not defined