Tech Labs | 14 Jun 2022

API Databank – Downloading Entire Databanks

Technology Team

Oxford Economics

In most circumstances, narrowly-defined selections are the best and quickest method to download data with the API. However, in some cases it may be necessary to fetch everything all at once. The way to do this is nearly identical to any other download but for the following two conditions:

A simple example Python implementation, using a custom selection:

#
# Copyright (c) 2022 Oxford Economics Ltd. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the
# project root for full license information.
#

import json
import requests

API_KEY = # insert api key
BASE_URL = ‘https://services.oxfordeconomics.com’

sample_selection = {
    ‘DatabankCode’: ‘WDMacro’,
    ‘Frequency’: ‘Annual’,
    ‘GroupingMode’: ‘false’,
    ‘IndicatorSortOrder’: ‘AlphabeticalOrder’,
    ‘IsTemporarySelection’: ‘true’,
    ‘ListingType’: ‘Private’,
    ‘LocationSortOrder’: ‘AlphabeticalOrder’,
    ‘Order’: ‘IndicatorLocation’,
    ‘Sequence’: ‘EarliestToLatest’,
    ‘StackedQuarters’: ‘false’,
    ‘StartYear’: 1980,
    ‘EndYear’: 2045,
    # note: the fields below have been assigned empty lists
    ‘Regions’: [
    ],
    ‘Variables’: [
    ]
}

def databank_download(selection_dictionary):
    headers = {‘Accept’: ‘application/json’,
        ‘Api-Key’: API_KEY,
        ‘Content-Type’: ‘application/json; charset=utf8’ }
    def _download_url(base_url, page, pagesize):
        url = base_url + ‘/api/download?includemetadata=true’
        return url + ‘&page={1}+&pagesize={2}’.format(base_url, page, page_size)
    page = 0
    page_size = 5000
    data = None
    while True:
        # note: _download_url returns a link of the form
        # ../api/download?includemetadata=true&page={page}+&page_size={page_size}
        response = requests.post(_download_url(BASE_URL, page, page_size),
            headers=headers,
            data=json.dumps(selection_dictionary))
        new_data = response.json()
        page += 1
        if data is None:
            data = new_data
        else:
            data.extend(new_data)
        if len(new_data) < page_size:
            break
    return data

if __name__ == ‘__main__’:
    data = databank_download(sample_selection)
    data_file = open(‘data_file.tmp’, ‘w’)
    json.dump(data, data_file)

A more advanced example Python implementation, using a saved selection ID:
#
# Copyright (c) 2017 Oxford Economics Ltd. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the
# project root for full license information.
#

import sys
import json
import requests
import time

# WARNING: In production these values should be provided via environment or command line args
API_KEY = ‘<<ENTER API KEY HERE>>’
SELECTION = {‘ID’: ‘<<ENTER SELECTION ID HERE>>’, ‘NAME’: ‘<<ENTER NAME OF SELECTION HERE>>’}
PAGE_SIZE = 20000   # set to number of records per page (20k is the max allowed by API)
MAX_PAGES = None    # set to None for as many pages as possible
CHECKPOINT = 5      # checkpoint (save) after this many pages

def databank_download(f, selection_id, show_progress=True):
    BASE_URL = ‘https://services.oxfordeconomics.com’
    INCLUDE_METADATA=’false’ # or set = ‘true’

    headers = { ‘Accept’: ‘application/json’,
                ‘Api-Key’: API_KEY,
                ‘Content-Type’: ‘application/json; charset=utf8’ }

    def _download_url(base_url, page, pagesize):
        return  f'{base_url}/api/download/{selection_id}?includemetadata={INCLUDE_METADATA}&page={page}&pagesize={page_size}’
    page = 0
    page_size = PAGE_SIZE
    max_pages = MAX_PAGES
    data = None
    api_request_time_total = 0.0
    disk_write_time_total = 0.0

    while True:
        # download selection_id with some escalating wait retry logic
        max_retries = 5
        retries = 0
        while retries < max_retries:
            try:
                start = time.time()
                response = requests.get(_download_url(BASE_URL, page, page_size), headers=headers)
                end = time.time()
                api_request_time_total += (end – start)
                break
            except ConnectionError as msg:
                print(f’ConnectionError Exception!\n{msg}’)
                retries += 1
                print(f’Waiting to retry ({retries}/5) in {retries*5} seconds…\n{msg}’)
                time.sleep(retries*5)
                pass
            except TimeoutError as msg:
                print(f’ConnectionError Exception!\n{msg}’)
                retries += 1
                print(f’Waiting to retry ({retries}/5) in {retries*5} seconds…\n{msg}’)
                time.sleep(retries*5)
                pass
            except Exception as msg:
                print(f’Exception!\n{msg}’)
                raise RuntimeError(‘Something bad happened!’)
        if retries == max_retries:
            return page, api_request_time_total, disk_write_time_total
        page_data = response.json()
        page += 1
        if show_progress:
            sys.stdout.write(‘#’)
            sys.stdout.flush()

        # accumulate data
        if data is None:
             data = page_data
        else:
             data.extend(page_data)
        # dump to file every so many pages
        if page % CHECKPOINT == 0 and data != None:
            sys.stdout.write(‘+’)
            sys.stdout.flush()
            start = time.time()
            json.dump(data, f)
            end = time.time()
            disk_write_time_total += (end – start)
            data = None

        # get all pages or limit to max_pages
        if max_pages == None:
            pass
        elif page == max_pages:
            break
        # have we got to the end
        if len(page_data) < page_size:
            break

    # dump the last chunk of data, if any
    if not data is None:
        start = time.time()
        json.dump(data, f)
        end = time.time()
        disk_write_time_total += (end – start)
    return page, api_request_time_total, disk_write_time_total

if __name__ == ‘__main__’:
    SELECTION_ID = SELECTION[‘ID’]
    DATA_FILE = SELECTION[‘NAME’] + ‘.tmp’
    start = time.asctime(time.localtime(time.time()))
    print(f’Started @ {start} | DATA_FILE={DATA_FILE}, PAGE_SIZE={PAGE_SIZE}, MAX_PAGES={MAX_PAGES}, CHECKPOINT={CHECKPOINT}’)
    with open(DATA_FILE, ‘w’) as f:
        pages, api_request_time_total, disk_write_time_total = databank_download(f, SELECTION_ID)
    end = time.asctime(time.localtime(time.time()))
    print(f’\nCOMPLETE! Downloaded {pages} pages’)
    print(f’Started @ {start}’)
    print(f’Ended @ {end}’)
    print(‘Total API Request Time: %.1f secs, %.1f mins’ % (api_request_time_total, api_request_time_total/60))
    print(‘Total Disk Write Time: %.1f secs, %.1f mins’ % (disk_write_time_total, disk_write_time_total/60))
”’

Please note:

If you’re using vscode to develop/debug this program and it encounters an SSL error, then do the following:

#     mklink “<YOUR ANACONDA LOCATION>\Anaconda3\DLLs\libcrypto-1_1-x64.dll” “<YOUR ANACONDA LOCATION>\Anaconda3\Library\bin\libcrypto-1_1-x64.dll”
#     mklink “<YOUR ANACONDA LOCATION>\Anaconda3\DLLs\libssl-1_1-x64.dll” “<YOUR ANACONDA LOCATION>\Anaconda3\Library\bin\libssl-1_1-x64.dll”

You can also physically copy them over.

The exception you may see will be something like this:

#     HTTPSConnectionPool(host=’services.oxfordeconomics.com’, port=443):
#     Max retries exceeded with url: /api/download/…?includemetadata=false&page=0+&pagesize=2000
#     (Caused by SSLError(“Can’t connect to HTTPS URL because the SSL module is not available.”))

Related Services

Post

From opportunity to impact – Assessing the economic, societal, and cultural benefits of YouTube in Finland

This study assesses YouTube's contribution to GDP and employment in Finland, and its broader impact on society and culture.

Find Out More

Post

Mahdollisuuksista vaikuttavuuteen – Arviointi YouTuben taloudellisista, yhteiskunnallisista ja kulttuurisista hyödyistä Suomessa

Tässä tutkimuksessa arvioidaan YouTuben vaikutusta Suomen bruttokansantuotteeseen ja työllisyyteen sekä sen laajempaa vaikutusta yhteiskuntaan ja kulttuuriin.

Find Out More

Post

Economic Impacts of The Umbrella Academy Season Two in Ontario

In 2019, production of Season Two of “The Umbrella Academy” resulted in total spending in Ontario of over C$77.3 million.

Find Out More