Source code for diviner.data_prep

import pandas as pd
import numpy as np
from . import divconstants as c
from scipy import ndimage as nd
from datetime import datetime

###
### general tools for data preparation
###


def format_time(intime):
    """Round a timestamp to millisecond precision.

    Parameters
    ----------
    intime : pandas.Timestamp
        Timestamp to round. Any object offering ``to_pydatetime()`` works.

    Returns
    -------
    pandas.Timestamp
        Timezone-naive timestamp whose sub-second part is rounded to three
        decimals.
    """
    t = intime.to_pydatetime()
    # Round in integer milliseconds instead of slicing a formatted string:
    # a fraction such as .9996 rounds up to a full second, and that carry
    # must be added to the seconds field rather than discarded.
    millis = int(round(t.microsecond / 1000.0))
    # tzinfo is stripped so the result stays naive, as it was when the
    # value was rebuilt from a '%Y-%m-%d %H:%M:%S.%f' string.
    whole_second = t.replace(microsecond=0, tzinfo=None)
    return pd.Timestamp(whole_second) + pd.Timedelta(milliseconds=millis)
def generate_date_index(dataframe):
    """Create a DatetimeIndex from the date/time columns of a dataframe.

    Two column naming schemes are understood:
    ``year, month, date, hour, minute, second`` and
    ``yyyy, mm, dd, hh, mn, ss``. The second scheme is used whenever the
    first one is not completely present.

    Side effect: the seconds column of ``dataframe`` is overwritten with
    its value rounded to three decimals (milliseconds).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the six date/time columns.

    Returns
    -------
    pandas.DatetimeIndex
        One timestamp per row, in row order.

    Raises
    ------
    AttributeError
        If neither naming scheme is available on ``dataframe``.
    """
    d = dataframe
    units = ('year', 'month', 'day', 'hour', 'minute', 'second')
    primary = ('year', 'month', 'date', 'hour', 'minute', 'second')
    alternate = ('yyyy', 'mm', 'dd', 'hh', 'mn', 'ss')
    names = primary if all(hasattr(d, name) for name in primary) else alternate

    # Force 3-digit precision on the seconds and keep it in the frame.
    seconds_name = names[-1]
    d[seconds_name] = np.round(getattr(d, seconds_name) * 1000) / 1000

    # pd.to_datetime assembles timestamps from unit-named columns; this
    # replaces pd.io.date_converters.parse_all_fields, which is no longer
    # part of pandas.
    parts = {}
    for unit, name in zip(units, names):
        values = getattr(d, name).values
        if unit != 'second':
            # Calendar fields must be whole numbers for the assembly.
            values = values.astype('int64')
        parts[unit] = values
    return pd.DatetimeIndex(pd.to_datetime(pd.DataFrame(parts)))
def index_by_time(df, drop_dates=True):
    """Index ``df`` by time, in place, and return it.

    The index is replaced by the result of ``generate_date_index(df)``.
    With ``drop_dates`` set, the date/time columns that were used to build
    the index are removed from ``df`` as well (also in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding either the ``year, month, date, hour, minute,
        second`` or the ``yyyy, mm, dd, hh, mn, ss`` columns.
    drop_dates : bool, optional
        Remove the date/time columns after indexing. Default True.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified.
    """
    df.index = generate_date_index(df)
    if drop_dates:
        primary = ['year', 'month', 'date', 'hour', 'minute', 'second']
        alternate = ['yyyy', 'mm', 'dd', 'hh', 'mn', 'ss']
        # Choose the column set by membership instead of catching the
        # exception from a failed drop: the exception type pandas raises
        # for missing labels depends on the pandas version.
        present = set(df.columns)
        cols_to_drop = primary if present.issuperset(primary) else alternate
        df.drop(cols_to_drop, axis=1, inplace=True)
    return df
def cutoff_msec(ts):
    """Return ``ts`` truncated to the whole second as a new Timestamp.

    Everything below one second is discarded, not rounded. The result is
    rebuilt from the year..second fields only.
    """
    as_datetime = ts.to_pydatetime()
    # timetuple() starts with (year, month, day, hour, minute, second).
    down_to_second = as_datetime.timetuple()[:6]
    return pd.Timestamp(datetime(*down_to_second))
def parse_divdata_times(df, drop_dates=True):
    """Index ``df`` by its time columns, rounded to the millisecond.

    The columns ``year, month, date, hour, minute`` are read as integers
    and ``second`` as (possibly fractional) seconds. The index of ``df``
    is replaced in place by the resulting timestamps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six time columns named above.
    drop_dates : bool, optional
        Return a copy without the six time columns. Default True.

    Returns
    -------
    pandas.DataFrame
        A new frame without the time columns if ``drop_dates`` is true,
        otherwise ``df`` itself. In both cases ``df`` carries the new index.
    """
    timecols = ['year', 'month', 'date', 'hour', 'minute']
    # pd.to_datetime assembles timestamps from unit-named columns; it
    # expects the day-of-month column to be called 'day'.
    parts = df[timecols].astype('int64').rename(columns={'date': 'day'})
    up_to_min = pd.to_datetime(parts)
    seconds = pd.to_timedelta(df.second, unit='s')
    # Vectorised rounding to the nearest millisecond, carrying into the
    # seconds where needed.
    df.index = pd.DatetimeIndex(up_to_min + seconds).round('ms')
    if drop_dates:
        return df.drop(timecols + ['second'], axis=1)
    return df
def prepare_data(df_in):
    """Normalise the pointing-command column names and pad their NaNs.

    If ``last_el_cmd`` is absent, ``el_cmd`` and ``az_cmd`` are renamed to
    ``last_el_cmd`` and ``last_az_cmd``. Afterwards NaN entries in
    ``last_el_cmd``, ``last_az_cmd`` and ``moving`` are filled with the
    last preceding valid value; leading NaNs stay NaN.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to prepare. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df_in`` itself.
    """
    # Converting a -9999.0 marker to NaN is not done here (it was disabled
    # in the original code), and neither is time indexing.
    if 'last_el_cmd' not in df_in.columns:
        df_in.rename(columns={'el_cmd': 'last_el_cmd',
                              'az_cmd': 'last_az_cmd'}, inplace=True)
    # Explicit forward fill, assigned back to the frame: this does not rely
    # on replace() padding implicitly, nor on an in-place call on a column
    # accessor propagating to the parent frame.
    df_in['last_el_cmd'] = df_in.last_el_cmd.ffill()
    df_in['last_az_cmd'] = df_in.last_az_cmd.ffill()
    df_in['moving'] = df_in.moving.ffill()
    return df_in
def get_sv_selector(df):
    """Return a boolean row selector for the SV pointing limits.

    A row is selected when ``last_az_cmd`` lies within
    ``[c.SV_AZ_MIN, c.SV_AZ_MAX]`` and ``last_el_cmd`` within
    ``[c.SV_EL_MIN, c.SV_EL_MAX]``, bounds included. The limits come from
    the divconstants module ``c``.
    """
    az = df.last_az_cmd
    el = df.last_el_cmd
    az_in_range = (az >= c.SV_AZ_MIN) & (az <= c.SV_AZ_MAX)
    el_in_range = (el >= c.SV_EL_MIN) & (el <= c.SV_EL_MAX)
    return az_in_range & el_in_range
def get_bb_selector(df):
    """Return a boolean row selector for the BB pointing limits.

    A row is selected when ``last_az_cmd`` lies within
    ``[c.BB_AZ_MIN, c.BB_AZ_MAX]`` and ``last_el_cmd`` within
    ``[c.BB_EL_MIN, c.BB_EL_MAX]``, bounds included. The limits come from
    the divconstants module ``c``.
    """
    az = df.last_az_cmd
    el = df.last_el_cmd
    az_in_range = (az >= c.BB_AZ_MIN) & (az <= c.BB_AZ_MAX)
    el_in_range = (el >= c.BB_EL_MIN) & (el <= c.BB_EL_MAX)
    return az_in_range & el_in_range
def get_st_selector(df):
    """Return a boolean row selector for the ST pointing limits.

    A row is selected when ``last_az_cmd`` lies within
    ``[c.ST_AZ_MIN, c.ST_AZ_MAX]`` and ``last_el_cmd`` within
    ``[c.ST_EL_MIN, c.ST_EL_MAX]``, bounds included. The limits come from
    the divconstants module ``c``.
    """
    az = df.last_az_cmd
    el = df.last_el_cmd
    az_in_range = (az >= c.ST_AZ_MIN) & (az <= c.ST_AZ_MAX)
    el_in_range = (el >= c.ST_EL_MIN) & (el <= c.ST_EL_MAX)
    return az_in_range & el_in_range
def get_stowed_selector(df):
    """Return a boolean row selector that is true where both commands are 0.

    A row is selected when ``last_az_cmd`` and ``last_el_cmd`` both equal
    zero.
    """
    az_is_zero = df.last_az_cmd == 0
    el_is_zero = df.last_el_cmd == 0
    return az_is_zero & el_is_zero
def define_sdtype(df):
    """Add the ``sdtype`` code, block labels and ``is_*`` flags to ``df``.

    ``df`` is modified in place. ``sdtype`` codes assigned here:
    0 (default), 1 (sv selector), 2 (bb selector), 3 (st selector),
    -2 (stowed selector) and -1 (rows where ``moving == 1``).
    """
    df['sdtype'] = 0
    # Applied in this order: a later selector overwrites an earlier one on
    # rows that match both.
    pointing_codes = (
        (get_sv_selector, 1),
        (get_bb_selector, 2),
        (get_st_selector, 3),
        (get_stowed_selector, -2),
    )
    for make_selector, code in pointing_codes:
        df.loc[make_selector(df), 'sdtype'] = code

    # Sequential list of calibration blocks: nd.label hands out one ID per
    # consecutive run of rows where the condition holds.
    # DECISION: the block labels contain moving data as well, so that the
    # space view and the bb/st view stay attached to each other and can be
    # handled later as one calibration block.
    # WARNING: not all moving data is contained in the block labels! The
    # end of a calib block has its pointing commands set to nadir.
    # The "is_xxx" columns defined further down do NOT contain moving data.
    sdtype = df['sdtype']
    in_calib = (sdtype == 1) | (sdtype == 2) | (sdtype == 3)
    calib_labels = nd.label(in_calib)[0]
    df['calib_block_labels'] = calib_labels

    # Rows with sdtype > 0 that are still 'moving' are reset to sdtype -1.
    # This has to happen *after* the calib block labels exist (see above),
    # and before the per-view labels, so that no moving data ends up in
    # e.g. bb_block_labels.
    df.loc[df.moving == 1, 'sdtype'] = -1

    view_label_columns = (
        ('space_block_labels', 1),
        ('bb_block_labels', 2),
        ('st_block_labels', 3),
    )
    for column, code in view_label_columns:
        df[column] = nd.label(df.sdtype == code)[0]

    # sdtype is clean now, so the flags need no further check for moving.
    flag_columns = (
        ('is_spaceview', 1),
        ('is_bbview', 2),
        ('is_stview', 3),
        ('is_moving', -1),
        ('is_stowed', -2),
    )
    for column, code in flag_columns:
        df[column] = (df.sdtype == code)
    df['is_calib'] = df.is_spaceview | df.is_bbview | df.is_stview
# this does the same as above labeling, albeit here the blocks are numbered # individually. Not sure I will need it but might come in handy.