util.py 1.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Created by schaffer at 11/6/19

Collection of utility functions specific to GRIS
"""

import datetime
import re
from glob import glob
from os.path import basename

import pandas as pd


def get_observers(logfile):
    """Return the observer names found in a log file.

    The file is first reduced to a raw candidate string by
    ``get_candidates``; that string is then passed to ``extract_names``.

    Args:
        logfile: Path of the log file to inspect.

    Returns:
        The list produced by ``extract_names`` for the candidate string.
    """
    raw_candidates = get_candidates(logfile)
    names = extract_names(raw_candidates)
    return names


def get_candidates(logfile):
    """Return the raw observer text found in the header of a log file.

    Only the first five lines of the whitespace-stripped file content are
    inspected. Two patterns are tried in order (case-insensitively) and the
    first one that yields any match wins:

    1. an ``observer``/``remote`` keyword, optionally followed by a colon;
       the text following it up to the end of the line is captured;
    2. as a fallback, any line consisting solely of letters, spaces,
       commas and dots.

    Args:
        logfile: Path of the log file to read.

    Returns:
        All fragments captured by the winning pattern joined with ``','``,
        or an empty string when neither pattern matches.
    """
    with open(logfile, 'r') as f:
        text = f.read().strip()

    # Both patterns require a terminating '\n'. Stripping and slicing leave
    # the last inspected line without one, so it is re-added here; otherwise
    # an entry on that line (or in a one-line file) could never match.
    text = '\n'.join(text.split('\n')[:5]) + '\n'

    patterns = (
        r'[a-zA-Z\-]*[ \t\r\f\v]*(?:observer|remote)+\w*[ \t\r\f\v]*:?\s+(.+)\n',
        r'(?:\n|^)*\s*([a-z ,\.]+)\n',
    )

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            # The first pattern that matches anything decides the result.
            return ','.join(matches)

    return ''


def extract_names(obs_candidates):
    """Split a raw observer string into individual capitalized names.

    The string is split on commas, with ``' and '`` treated as an
    additional separator. Only pieces that contain an uppercase letter
    directly followed by a lowercase letter are kept; surrounding
    whitespace of the pieces is left untouched.

    Args:
        obs_candidates: Comma-separated candidate text.

    Returns:
        A list of the pieces that look like capitalized names.
    """
    # str.replace returns a new string; its result must be used, otherwise
    # names joined by "and" are never separated.
    normalized = obs_candidates.replace(' and ', ',')
    return [
        piece for piece in normalized.split(',')
        if re.search(r'[A-Z][a-z]', piece)
    ]


def observer_data_frame(pattern='/dat/sdc/gris/*/*.txt'):
    """Build a date-indexed table of observers from date-named log files.

    Files matched by ``pattern`` are kept only when their base name is
    exactly eight digits followed by ``.txt``; that name is parsed as
    ``%Y%m%d`` to obtain the date of the row. The observers of each file
    are obtained from ``get_observers`` and stored as one comma-joined
    string.

    Args:
        pattern: Glob pattern used to locate candidate log files.

    Returns:
        A ``pandas.DataFrame`` indexed by ``date`` with a single
        ``observers`` column. Its first rows are also printed.

    Raises:
        ValueError: If an eight-digit file name is not a valid date.
    """
    # fullmatch (not match) so names such as '20190101.txt.old.txt' are
    # filtered out here instead of crashing strptime further down.
    logfiles = [
        path for path in glob(pattern)
        if re.fullmatch(r'\d{8}\.txt', basename(path))
    ]

    observers = []
    dates = []
    for path in logfiles:
        observers.append(','.join(get_observers(path)))
        dates.append(datetime.datetime.strptime(basename(path), '%Y%m%d.txt'))

    df = pd.DataFrame(dict(date=dates, observers=observers))
    df = df.set_index('date')

    print(df.head())
    # Hand the frame back to the caller instead of discarding it.
    return df