#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created by schaffer at 11/6/19

Collection of utility functions specific to GRIS
"""
import datetime
import re
from glob import glob
from os.path import basename

import pandas as pd

# Number of leading log-file lines that are searched for observer names.
_HEADER_LINES = 5

# Patterns are tried in order; the first one that yields any match wins.
_OBSERVER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # A line mentioning "observer"/"remote", optionally followed by a
        # colon; the remainder of the line is captured.
        r'[a-zA-Z\-]*[ \t\r\f\v]*(?:observer|remote)+\w*[ \t\r\f\v]*:?\s+(.+)\n',
        # Fallback: a line made up only of letters, spaces, commas and dots.
        r'(?:\n|^)*\s*([a-z ,\.]+)\n',
    )
)

# File names of daily logs look like ``YYYYMMDD.txt``.
_LOGFILE_NAME = re.compile(r'\d{8}\.txt')


def get_observers(logfile: str) -> list:
    """Return the list of observer names found in the header of *logfile*.

    This is ``extract_names`` applied to the result of ``get_candidates``.
    """
    return extract_names(get_candidates(logfile))


def get_candidates(logfile: str) -> str:
    """Return a comma-joined string of observer candidates from *logfile*.

    Only the first ``_HEADER_LINES`` lines of the (whitespace-stripped) file
    content are searched. The patterns in ``_OBSERVER_PATTERNS`` are tried in
    order and the captures of the first pattern that matches anything are
    joined with commas. An empty string is returned when nothing matches.
    """
    with open(logfile, 'r') as f:
        text = f.read().strip()

    # Both patterns require a terminating newline. The stripped/truncated
    # excerpt has none after its last line, so add one; otherwise the last
    # header line (or the only line of a one-line log) could never match.
    header = '\n'.join(text.split('\n')[:_HEADER_LINES]) + '\n'

    for pattern in _OBSERVER_PATTERNS:
        matches = pattern.findall(header)
        if matches:
            return ','.join(matches)
    return ''


def extract_names(obs_candidates: str) -> list:
    """Split *obs_candidates* into individual names.

    The string is split on commas and on the word `` and ``. Only parts that
    contain a capitalised word (an upper-case letter followed by a lower-case
    letter) are kept. Surrounding whitespace of the parts is left untouched.
    """
    # str.replace returns a new string; the result has to be used, otherwise
    # "A and B" would never be split into two names.
    normalized = obs_candidates.replace(' and ', ',')
    return [part for part in normalized.split(',') if re.search(r'[A-Z][a-z]', part)]


def observer_data_frame() -> pd.DataFrame:
    """Build a date-indexed table of observers from the GRIS daily log files.

    Every ``/dat/sdc/gris/*/*.txt`` file whose name is exactly
    ``YYYYMMDD.txt`` is parsed with ``get_observers``; the names are stored as
    one comma-joined string per day. The head of the frame is printed and the
    frame, indexed by ``date``, is returned.
    """
    files = glob('/dat/sdc/gris/*/*.txt')
    # fullmatch: names that merely start with eight digits and ".txt" would
    # pass a prefix match and then make strptime raise.
    logfiles = [path for path in files if _LOGFILE_NAME.fullmatch(basename(path))]

    observers = [','.join(get_observers(path)) for path in logfiles]
    dates = [
        datetime.datetime.strptime(basename(path), '%Y%m%d.txt')
        for path in logfiles
    ]

    df = pd.DataFrame(dict(date=dates, observers=observers))
    df = df.set_index('date')
    print(df.head())
    return df