import pandas as pd
from pathlib import Path
import os

def find_oov_buildings(benchmark_dir, building_csvs, oov_file, threshold=5100):
    """Record every building whose peak load exceeds ``threshold``.

    Each listed CSV is read with its first column as a date-parsed index and
    every remaining column treated as one building. A building is reported
    when the maximum value of its column is strictly greater than
    ``threshold``.

    Args:
        benchmark_dir: ``pathlib.Path`` of the directory the CSV files are
            resolved against.
        building_csvs: Iterable of CSV paths relative to ``benchmark_dir``,
            given without the ``.csv`` suffix. Surrounding whitespace
            (e.g. a trailing newline) is stripped. Entries containing
            neither ``'Electricity'`` nor ``'BDG-2'`` are skipped.
        oov_file: Writable text stream. Each newly found building id is
            written to it once, followed by a newline.
        threshold: Load value a column maximum must exceed for the building
            to be reported. The default of 5100 is the cutoff this script
            has always used (presumably the upper bound of the supported
            load range -- TODO confirm).

    Returns:
        The set of unique ids, each formatted as ``'<dataset> <building>'``
        where ``<dataset>`` is the first ``/``-separated component of the
        CSV entry.
    """
    bldgs = set()

    for bc in building_csvs:
        csv_name = bc.strip()
        if 'Electricity' not in csv_name and 'BDG-2' not in csv_name:
            continue

        df = pd.read_csv(benchmark_dir / (csv_name + '.csv'),
                         index_col=0, header=0, parse_dates=True)
        dataset = csv_name.split('/')[0]

        # For each building in the dataframe, check whether any load value
        # is greater than the threshold.
        for bldg in df.columns:
            peak = df[bldg].max()
            if not peak > threshold:
                continue

            # Every exceedance is printed, even for a building already seen
            # in another file; only the first sighting is written out.
            print(bldg, peak)

            id_ = f'{dataset} {bldg}'
            if id_ not in bldgs:
                bldgs.add(id_)
                # Keep track of the dataset and building name.
                oov_file.write(id_ + '\n')

    return bldgs


if __name__ == '__main__':
    benchmark_root = os.environ.get('BUILDINGS_BENCH')
    if benchmark_root is None:
        # Fail with a clear message instead of the TypeError Path(None) raises.
        raise SystemExit('The BUILDINGS_BENCH environment variable must be set.')

    # Read in the list of building csv files
    with open('building_years.txt', 'r') as f:
        building_csvs = f.readlines()

    # The context manager guarantees oov.txt is flushed and closed even if
    # reading one of the CSV files fails part-way through.
    with open('oov.txt', 'w') as oov_list:
        bldgs = find_oov_buildings(Path(benchmark_root), building_csvs, oov_list)

    print(f'{len(bldgs)} unique buildings')
