# SPC Scotland failing Currently, Wales and England LADs run and produce data. Scotland LADs produce errors and incomplete data - https://github.com/alan-turing-institute/spc-hpc-pipeline - https://github.com/alan-turing-institute/spc-hpc-pipeline/blob/main/scripts/scp/SPENSER_HPC_setup.sh ## Problem On step 1 of the simulations: ```bash echo 'Step 1' cd microsimulation python -W ignore scripts/run_ssm.py -c config/ssm_current.json $var ``` we get an error of ```python ValueError Traceback (most recent call last) File ~/microsimulation/scripts/run_ssm.py:48 45 if __name__ == "__main__": 47 PARAMS = utils.get_config() ---> 48 main(PARAMS) File ~/microsimulation/scripts/run_ssm.py:36, in main(params) 33 print("Static P Microsimulation: ", region, "@", resolution) 35 # init microsynthesis ---> 36 ssm = Static.SequentialMicrosynthesis(region, resolution, variant, is_custom, cache_dir, output_dir, use_fast_mode) 37 ssm.run(ref_year, horizon_year) 39 print(region, "done. Exec time(s): ", time.time() - start_time) File ~/microsimulation/microsimulation/static.py:57, in SequentialMicrosynthesis.__init__(self, region, resolution, variant, is_custom, cache_dir, output_dir, fast_mode) 53 raise ValueError("fast mode should be boolean") 55 # TODO enable 2001 ref year? 
56 # (down)load the census 2011 tables ---> 57 self.__get_census_data() File ~/microsimulation/microsimulation/static.py:190, in SequentialMicrosynthesis.__get_census_data(self) 188 def __get_census_data(self): --> 190 (dc1117, dc2101, dc6206) = self.get_census_data() 192 # add children to adult-only table 193 #dc6206ew_adj = self.append_children(dc1117, dc6206) 194 # For now we drop NS-SEC (not clear if needed) 195 dc6206_adj = None File ~/microsimulation/microsimulation/common.py:26, in Base.get_census_data(self) 24 def get_census_data(self): 25 if self.region[0] == "S": ---> 26 return self.__get_census_data_sc() 27 elif self.region[0] == "N": 28 raise NotImplementedError("NI support not yet implemented") File ~/microsimulation/microsimulation/common.py:51, in Base.__get_census_data_sc(self) 49 dc1117seed = utils.unlistify(dc1117lad, ["DC1117SC_0_CODE", "DC1117SC_1_CODE"], [2, 86], "OBS_VALUE").astype(float) 50 # expand to all geogs within LAD ---> 51 dc1117seed = np.dstack([dc1117seed] * ngeogs).T 53 ga = utils.unlistify(qs103sc, ["GEOGRAPHY_CODE", "QS103SC_0_CODE"], [ngeogs, nages], "OBS_VALUE") 54 gs = utils.unlistify(qs104sc, ["GEOGRAPHY_CODE", "QS104SC_0_CODE"], [ngeogs, nsexes], "OBS_VALUE") File <__array_function__ internals>:180, in dstack(*args, **kwargs) File ~/miniconda/lib/python3.9/site-packages/numpy/lib/shape_base.py:723, in dstack(tup) 721 if not isinstance(arrs, list): 722 arrs = [arrs] --> 723 return _nx.concatenate(arrs, 2) File <__array_function__ internals>:180, in concatenate(*args, **kwargs) ValueError: need at least one array to concatenate ``` This happens because `ngeogs` is 0. ### Why is `ngeogs` 0? 
Because it is collected from an empty dataset `qs103sc` ```python def __get_census_data_sc(self): print("Synthesising Scottish DC1117/DC2101 tables from LAD-level seeds and univariate data") # age only, no gender qs103sc = self.data_api_sc.get_data("QS103SC", self.region, self.resolution, category_filters={"QS103SC_0_CODE": range(1,102)}) qs103sc = utils.cap_value(qs103sc, "QS103SC_0_CODE", 86, "OBS_VALUE") # sex only qs104sc = self.data_api_sc.get_data("QS104SC", self.region, self.resolution, category_filters={"QS104SC_0_CODE": [1,2]}) ngeogs = len(qs103sc.GEOGRAPHY_CODE.unique()) ``` as shown: ```python ipdb> qs103sc Empty DataFrame Columns: [GEOGRAPHY_CODE, QS103SC_0_CODE, OBS_VALUE] Index: [] ``` because of: ```python self.data_api_sc.get_data("QS103SC", self.region, self.resolution, category_filters={"QS103SC_0_CODE": range(1,102)}) ``` where: ```python ipdb> self.region 'S12000005' ipdb> self.resolution 'MSOA11' ``` ### get_data | NRScotland | UKCensusAPI Here the `get_data` function produces the empty dataframe noted above. A few curiosities are present in this function: ```python def get_data(self, table, coverage, resolution, category_filters={}, r_compat=False): """ Returns a table with categories in columns, filtered by geography and (optionally) category values If r_compat==True, instead of returning a pandas dataframe it returns a dict raw value data and column names that can be converted into an R data.frame """ # No data is available for Intermediate zones (~MSOA) so we get Data Zone (LSOA) then aggregate msoa_workaround = False if resolution == "MSOA11": msoa_workaround = True resolution = "LSOA11" ``` Notably the `msoa_workaround`, which here results in `resolution="LSOA11"`. Stepping through this function to the next lines of code ```python geography = self.get_geog(coverage, resolution) meta, raw_data = self.__get_rawdata(table, resolution) ``` Inspecting these we can see that ```python ipdb> geography array(['S01007454', 'S01007453', 
'S01007452', 'S01007424', 'S01007433', 'S01007444', 'S01007447', 'S01007480', 'S01007479', 'S01007478', 'S01007466', 'S01007465', 'S01007464', 'S01007463', 'S01007462', 'S01007476', 'S01007474', 'S01007473', 'S01007475', 'S01007472', 'S01007471', 'S01007477', 'S01007470', 'S01007467', 'S01007468', 'S01007461', 'S01007460', 'S01007469', 'S01007459', 'S01007432', 'S01007458', 'S01007455', 'S01007456', 'S01007457', 'S01007413', 'S01007409', 'S01007412', 'S01007411', 'S01007410', 'S01007419', 'S01007418', 'S01007420', 'S01007415', 'S01007416', 'S01007449', 'S01007451', 'S01007450', 'S01007448', 'S01007434', 'S01007436', 'S01007435', 'S01007439', 'S01007438', 'S01007437', 'S01007440', 'S01007442', 'S01007441', 'S01007429', 'S01007428', 'S01007425', 'S01007426', 'S01007427', 'S01007430', 'S01007431', 'S01007421', 'S01007422', 'S01007423', 'S01007443', 'S01007446', 'S01007445', 'S01007417', 'S01007414'], dtype=object) ``` and ```python ipdb> meta {'table': 'QS103SC', 'description': '', 'geography': 'LSOA11', 'fields': {'QS103SC_0_CODE': {0: 'All people', 1: 'Under 1', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: '10', 12: '11', 13: '12', 14: '13', 15: '14', 16: '15', 17: '16', 18: '17', 19: '18', 20: '19', 21: '20', 22: '21', 23: '22', 24: '23', 25: '24', 26: '25', 27: '26', 28: '27', 29: '28', 30: '29', 31: '30', 32: '31', 33: '32', 34: '33', 35: '34', 36: '35', 37: '36', 38: '37', 39: '38', 40: '39', 41: '40', 42: '41', 43: '42', 44: '43', 45: '44', 46: '45', 47: '46', 48: '47', 49: '48', 50: '49', 51: '50', 52: '51', 53: '52', 54: '53', 55: '54', 56: '55', 57: '56', 58: '57', 59: '58', 60: '59', 61: '60', 62: '61', 63: '62', 64: '63', 65: '64', 66: '65', 67: '66', 68: '67', 69: '68', 70: '69', 71: '70', 72: '71', 73: '72', 74: '73', 75: '74', 76: '75', 77: '76', 78: '77', 79: '78', 80: '79', 81: '80', 82: '81', 83: '82', 84: '83', 85: '84', 86: '85', 87: '86', 88: '87', 89: '88', 90: '89', 91: '90', 92: '91', 93: '92', 94: '93', 95: 
'94', 96: '95', 97: '96', 98: '97', 99: '98', 100: '99', 101: '100 and over'}}} ipdb> raw_data Unnamed: 0 All people Under 1 1 2 3 4 5 6 7 8 9 10 11 12 ... 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 and over 0 S92000003 5,295,403 58,715 59,556 58,909 58,764 56,877 55,224 55,236 53,786 52,325 53,046 55,067 56,769 58,656 ... 15,898 14,181 12,024 10,778 9,439 6,975 4,331 3,321 2,840 2,241 1,825 1,174 857 539 854 1 S12000005 51,442 615 561 593 620 538 562 534 512 544 556 571 609 606 ... 119 117 111 79 64 51 35 26 22 16 13 9 3 4 4 2 S12000006 151,324 1,487 1,577 1,495 1,563 1,538 1,540 1,558 1,470 1,463 1,347 1,555 1,570 1,681 ... 601 482 444 387 335 235 168 121 102 77 63 45 30 13 25 3 S12000008 122,767 1,348 1,372 1,387 1,319 1,336 1,292 1,316 1,305 1,202 1,279 1,295 1,371 1,375 ... 343 291 268 219 226 151 112 78 49 48 40 27 13 14 11 4 S12000010 99,717 1,155 1,139 1,223 1,228 1,180 1,162 1,126 1,155 1,023 1,075 1,125 1,164 1,174 ... 328 294 250 217 191 159 77 69 63 37 47 32 18 8 23 5 S12000011 90,574 944 957 988 1,041 1,054 1,044 1,163 1,100 1,109 1,142 1,150 1,169 1,250 ... 307 286 251 219 198 142 81 73 55 45 35 26 23 14 19 6 S12000013 27,684 235 243 277 287 312 277 268 302 301 265 304 279 325 ... 108 114 94 91 69 52 27 21 17 13 14 15 8 4 6 7 S12000014 155,990 1,786 1,961 1,903 1,863 1,826 1,802 1,740 1,697 1,602 1,570 1,673 1,696 1,818 ... 437 396 290 292 249 184 108 78 71 55 43 19 24 12 20 8 S12000015 365,198 4,116 4,311 4,213 4,122 4,013 3,957 3,991 3,788 3,649 3,771 3,814 3,874 4,135 ... 1,149 1,041 858 715 725 493 326 250 183 189 133 88 61 36 63 9 S12000017 232,132 2,457 2,424 2,645 2,652 2,527 2,442 2,534 2,481 2,456 2,495 2,589 2,498 2,819 ... 743 666 599 490 458 302 214 175 137 100 96 48 28 33 47 10 S12000018 81,485 811 833 846 885 829 830 853 800 790 739 845 877 920 ... 231 242 211 172 164 108 100 56 50 38 25 17 17 13 10 11 S12000019 83,187 951 995 961 1,032 939 982 971 924 872 932 983 1,014 1,045 ... 
225 205 170 145 131 99 58 44 39 32 23 14 17 6 9 12 S12000020 93,295 987 1,057 1,073 1,125 1,004 986 995 926 997 983 981 1,107 1,121 ... 306 239 222 187 190 127 89 74 48 39 39 20 17 8 13 13 S12000021 138,146 1,462 1,476 1,480 1,499 1,503 1,473 1,453 1,410 1,444 1,450 1,489 1,502 1,597 ... 409 362 350 283 282 197 108 86 81 57 50 35 43 12 24 14 S12000023 21,349 209 205 231 220 242 213 199 194 202 196 218 227 246 ... 70 60 54 52 44 21 22 23 18 15 5 5 10 4 7 15 S12000024 146,652 1,468 1,456 1,451 1,533 1,409 1,448 1,461 1,428 1,487 1,483 1,649 1,691 1,666 ... 556 520 439 403 300 239 146 131 96 80 84 46 30 22 36 16 S12000026 113,870 1,104 1,099 1,153 1,188 1,168 1,124 1,216 1,182 1,154 1,229 1,188 1,259 1,247 ... 476 408 328 306 285 207 127 81 70 45 54 37 22 13 24 17 S12000027 23,167 257 305 279 239 309 266 267 269 262 262 270 311 286 ... 71 60 51 50 44 31 15 22 12 9 9 10 3 2 3 18 S12000028 112,799 1,117 1,057 1,087 1,088 1,163 1,060 1,094 1,096 1,056 1,106 1,157 1,180 1,286 ... 465 408 348 317 269 201 128 93 92 63 52 41 32 9 18 19 S12000029 313,830 3,415 3,461 3,484 3,537 3,504 3,386 3,308 3,314 3,209 3,269 3,393 3,566 3,568 ... 846 774 662 596 515 378 228 187 142 127 101 58 40 30 45 20 S12000030 90,247 828 899 875 868 945 942 973 1,010 937 928 1,010 1,114 1,142 ... 290 233 199 165 162 126 75 72 57 39 29 26 17 12 13 21 S12000033 222,793 2,469 2,489 2,238 2,218 2,098 1,936 1,898 1,799 1,790 1,749 1,782 1,914 1,841 ... 605 548 475 401 392 274 141 124 110 82 59 39 35 22 39 22 S12000034 252,973 2,880 3,182 3,085 2,979 3,012 2,843 2,854 2,778 2,803 2,730 2,828 3,031 3,064 ... 753 690 543 516 422 341 187 155 133 93 87 59 26 27 36 23 S12000035 88,166 767 872 843 785 840 865 879 879 823 866 906 889 1,035 ... 336 291 262 230 211 164 102 63 58 57 49 31 16 15 32 24 S12000036 476,626 5,753 5,440 5,330 5,035 4,605 4,407 4,085 4,057 3,935 4,010 4,044 4,101 4,231 ... 
1,421 1,321 1,129 1,032 904 627 438 293 279 227 182 114 85 57 93 25 S12000038 174,908 1,971 1,936 1,924 1,917 1,888 1,856 1,927 1,783 1,694 1,818 1,904 1,971 1,909 ... 467 422 356 360 305 231 126 108 116 79 52 42 28 19 15 26 S12000039 90,720 1,017 1,069 1,044 1,096 1,010 942 1,009 920 918 871 892 1,040 991 ... 257 222 179 163 164 127 78 65 49 47 31 19 6 11 11 27 S12000040 175,118 2,258 2,286 2,332 2,390 2,299 2,208 2,213 2,117 2,061 2,123 2,086 2,074 2,214 ... 308 325 228 224 170 131 89 78 49 40 30 20 16 13 20 28 S12000041 115,978 1,130 1,179 1,222 1,263 1,173 1,203 1,214 1,260 1,187 1,232 1,266 1,306 1,397 ... 441 395 347 290 232 204 118 93 80 72 51 29 23 15 25 29 S12000042 147,268 1,599 1,622 1,558 1,557 1,455 1,416 1,383 1,406 1,306 1,344 1,359 1,444 1,453 ... 516 486 392 309 282 209 121 69 86 67 58 25 28 13 19 30 S12000044 337,727 3,892 4,211 4,159 4,146 4,141 3,936 4,067 3,917 3,772 3,782 3,863 3,961 4,190 ... 753 658 503 515 404 302 186 130 122 88 76 39 49 28 26 31 S12000045 105,026 990 974 1,063 1,073 1,052 1,154 1,115 1,173 1,062 1,176 1,172 1,249 1,304 ... 369 319 237 248 180 144 84 71 63 40 27 20 18 8 17 32 S12000046 593,245 7,237 6,908 6,467 6,396 5,965 5,670 5,572 5,334 5,215 5,268 5,706 5,711 5,720 ... 1,592 1,306 1,174 1,105 872 718 417 312 291 225 168 119 71 42 101 [33 rows x 103 columns] ``` Seemingly fine so far, as there is data! 
### Misc data wrangling ```python raw_data.replace("-", 0, inplace=True) raw_data.replace(",", "", inplace=True, regex=True) # assumes the first n are (unnamed) columns we don't want to melt, geography coming first: n = geog + num categories - 1 (the one to melt) lookup = raw_data.columns.tolist()[len(meta["fields"]):] id_vars = ["GEOGRAPHY_CODE"] for i in range(1,len(meta["fields"])): id_vars.append(table + "_" + str(i) + "_CODE") cols = id_vars.copy() cols.extend(list(range(0,len(lookup)))) raw_data.columns = cols raw_data = raw_data.melt(id_vars=id_vars) id_vars.extend([table + "_0_CODE", "OBS_VALUE"]) raw_data.columns = id_vars # ensure OBS_VALUE is numeric raw_data["OBS_VALUE"] = pd.to_numeric(raw_data["OBS_VALUE"]) # convert categories to numeric values for i in range(1,len(meta["fields"])): category_name = raw_data.columns[i] category_values = meta["fields"][category_name] # make sure metadata has same no. of categories assert len(category_values) == len(raw_data[category_name].unique()) category_map = { k: v for v, k in enumerate(category_values)} raw_data[category_name] = raw_data[category_name].map(category_map) # geography (and category_filter) must be lists if isinstance(geography, str): geography = [geography] ``` Results in the data table now looking: ```python In [2]: raw_data Out[2]: GEOGRAPHY_CODE QS103SC.csv_0_CODE OBS_VALUE 0 S92000003 0 5295403 1 S12000005 0 51442 2 S12000006 0 151324 3 S12000008 0 122767 4 S12000010 0 99717 ... ... ... ... 
3361 S12000041 101 25 3362 S12000042 101 19 3363 S12000044 101 26 3364 S12000045 101 17 3365 S12000046 101 101 [3366 rows x 3 columns] ``` ### The true problem ```python data = raw_data[raw_data.GEOGRAPHY_CODE.isin(geography)] ``` This creates the filtered empty `data` because `GEOGRAPHY_CODE` and the values in `geography` are not of the same format. ## `geography` This variable is defined by ```python def get_geog(self, coverage, resolution): """ Returns all areas at resolution in coverage """ # assumes all areas in coverage are the same type coverage_type = _coverage_type(coverage) if coverage_type == "ALL": return self.area_lookup[resolution].unique() # ensure list if isinstance(coverage, str): coverage = [coverage] return self.area_lookup[self.area_lookup[coverage_type].isin(coverage)][resolution].unique() ``` Where this `area_lookup` table is referenced and is defined in the class constructor ```python def __init__(self, cache_dir): """Constructor. Args: cache_dir: cache directory Returns: an instance. """ # checks exists and is writable, creates if necessary self.cache_dir = utils.init_cache_dir(cache_dir) self.offline_mode = not utils.check_online(self.URL1) if self.offline_mode: print("Unable to contact %s, operating in offline mode - pre-cached data only" % self.URL1) # download the lookup if not present lookup_file = self.cache_dir / "sc_lookup.csv" if not os.path.isfile(str(lookup_file)): self.make_sc_lookup() self.area_lookup = pd.read_csv(str(self.cache_dir / "sc_lookup.csv")) # TODO use a map (just in case col order changes) #self.area_lookup = self.area_lookup.rename(columns={"OutputArea", "DataZone", "InterZone", "Council":"LSOA11"}) self.area_lookup.columns = ["OA11", "LSOA11", "MSOA11", "LAD"] # This maybe broken for scotland ``` ### `sc_lookup.csv` Is it possible that this CSV's column order and the column names assigned to it have drifted apart? 
```python ["OA11", "LSOA11", "MSOA11", "LAD"] ``` In this csv we have ```csvpreview OutputArea,DataZone,InterZone,Council S00088956,S01006732,S02001275,S12000033 S00088957,S01006732,S02001275,S12000033 S00088958,S01006732,S02001275,S12000033 S00088959,S01006737,S02001275,S12000033 S00088960,S01006518,S02001238,S12000033 ``` Which indicates that `Council` or `LAD` columns are matching to `raw_data` and that `LSOA11` or `DataZone` are not present. So perhaps `raw_data` is incorrectly made? ### `__get_rawdata` ```python def __get_rawdata(self, table, resolution): """ Gets the raw csv data and metadata """ if not os.path.exists(os.path.join(str(self.cache_dir), table + ".csv")): print("Downloading...") try: zf = str(self.__source_to_zip(self.data_sources[self.GeoCodeLookup[resolution]])) print("File saved to: ", zf) with zipfile.ZipFile(zf) as zip_ref: print("Extracting downloaded zip...") zip_ref.extractall(str(self.cache_dir)) except NotImplementedError as e: print("Unsupported zip type... running unzip via shell...") subprocess.run(['unzip', zf, '-d', str(self.cache_dir)]) else: print("Data already chached at: " + str(os.path.join(str(self.cache_dir), table + ".csv"))) raw_data = pd.read_csv(os.path.join(str(self.cache_dir), table + ".csv")) # more sophisticate way to check for no data? 
if raw_data.shape == (2,1): raise ValueError("Table {}: data not available at {} resolution.".format(table, resolution)) # assumes: # - first column is geography (unnamed) # - any subsequent columns are categorical # - named columns are categories raw_cols = raw_data.columns.tolist() fields = {} col_index = 1 while raw_cols[col_index][:8] == "Unnamed:": # lists format better than numpy arrays fields[table + "_" + str(col_index) + "_CODE"] = raw_data[raw_cols[col_index]].unique().tolist() col_index = col_index + 1 categories = raw_data.columns.tolist()[col_index:] fields[table + "_0_CODE"] = dict(zip(range(len(categories)), categories)) meta = { "table": table, "description": "", "geography": resolution, "fields": fields } return (meta, raw_data) ``` Looking into the table downloaded here ```csvpreview ,All people,Males,Females S92000003,"5,295,403","2,567,444","2,727,959" S12000005,"51,442","25,109","26,333" S12000006,"151,324","73,405","77,919" S12000008,"122,767","59,389","63,378" S12000010,"99,717","47,873","51,844" S12000011,"90,574","43,165","47,409" ``` it would seem that `LAD` codes are used?