Module destination_based_sales.oecd_cbcr

This module is used to load and preprocess aggregated and anonymized country-by-country data from the OECD. These pro- vide the three revenue variables that we aim at distributing based on their approximative ultimate destination for head- quarter countries other than the US.

Expand source code
"""
This module is used to load and preprocess aggregated and anonymized country-by-country data from the OECD. These pro-
vide the three revenue variables that we aim at distributing based on their approximative ultimate destination for head-
quarter countries other than the US.
"""


########################################################################################################################
# --- Imports

import os

import numpy as np
import pandas as pd

import requests

from destination_based_sales.utils import impute_missing_continent_codes, CONTINENT_CODES_TO_IMPUTE_OECD_CBCR, \
    UK_CARIBBEAN_ISLANDS


########################################################################################################################
# --- Diverse

path_to_dir = os.path.dirname(os.path.abspath(__file__))

path_to_geographies = os.path.join(path_to_dir, 'data', 'geographies.csv')
path_to_GNI_data = os.path.join(path_to_dir, 'data', 'gross_national_income.csv')
path_to_tax_haven_list = os.path.join(path_to_dir, 'data', 'tax_havens.csv')
local_path_to_OECD_CbCR_data = os.path.join(path_to_dir, 'data', 'oecd_cbcr.csv')


########################################################################################################################
# --- Content

class CbCRPreprocessor:

    def __init__(
        self,
        load_raw_data=True,
        path_to_geographies=path_to_geographies,
        continent_code_imputations=CONTINENT_CODES_TO_IMPUTE_OECD_CBCR,
        path_to_GNI_data=path_to_GNI_data,
        path_to_tax_haven_list=path_to_tax_haven_list,
        fetch_data_online=False
    ):
        """
        The instructions allowing to load and preprocess OECD data are organised in a Python class, CbCRPreprocessor.

        This is the instantiation function for this class. It requires several arguments:

        - "load_raw_data": a boolean indicating whether or not to directly load the data in a class attribute;

        - "path_to_geographies": the string path to the "geographies.csv" file, used for instance to complement the
        OECD's country-by-country data with country codes;

        - "continent_code_imputations": a dictionary with the OECD codes (for continental aggregates in particular) as
        keys and the codes that we use throughout our computations as values;

        - "path_to_GNI_data": a string path to the "gross_national_income.csv" file, used to build comparisons between
        the distribution of revenue variables and that of Gross National Income (GNI);

        - "path_to_tax_haven_list": a string path to the "tax_havens.csv" file, that contains the list of tax havens
        compiled by Tørsløv, Wier and Zucman (2019);

        - "fetch_data_online": a boolean indicating whether to fetch the country-by-country data online (if set to True)
        or locally from the "data" folder (if set to False).
        """
        if fetch_data_online:
            # If relevant, we construct the URL from which we can load the CSV country-by-country dataset
            self.url_base = 'http://stats.oecd.org/SDMX-JSON/data/'
            self.dataset_identifier = 'CBCR_TABLEI/'
            self.dimensions = 'ALL/'
            self.agency_name = 'OECD'

            self.path_to_OECD_data = (
                self.url_base + self.dataset_identifier + self.dimensions + self.agency_name + '?contenttype=csv'
            )

        else:
            # Or we use the path to the local file
            self.path_to_OECD_CbCR_data = local_path_to_OECD_CbCR_data

        self.path_to_geographies = path_to_geographies
        self.continent_code_imputations = continent_code_imputations

        self.path_to_GNI_data = path_to_GNI_data
        self.path_to_tax_haven_list = path_to_tax_haven_list

        # If relevant, we load the data in a dedicated class attribute
        if load_raw_data:
            if fetch_data_online:
                print("Fetching the OECD's aggregated and anonymized CbCR data - This may take up to 30 seconds.")

            self.data = pd.read_csv(self.path_to_OECD_CbCR_data)

            if fetch_data_online:
                print("Loaded the OECD's aggregated and anonymized CbCR data successfully.")

        else:
            self.data = None

    def load_raw_data(self):
        """
        This class method allows to load the OECD's anonymized and aggregated data from the pre-constructed string path
        (see the instantiation function above), in case it has not been done in the instantiation function (after having
        set load_raw_data=False). Data are stored in a dedicated class attribute.
        """
        self.data = pd.read_csv(self.path_to_OECD_CbCR_data)

    def get_preprocessed_revenue_data(self):
        """
        This class method, relying on the pre-loaded data, applies a series of preprocessing steps to the raw data. The
        latter has indeed been loaded from the complete CSV file with a lot of non-relevant fields and a specific orga-
        nisation. In particular, we pivot the DataFrame to move from a long to a wide dataset.
        """
        if self.data is None:
            raise Exception('You must load the data with the dedicated method before you may run any computation.')

        cbcr = self.data.copy()

        # We focus on the positive-profit sub-sample only
        cbcr = cbcr[cbcr['PAN'] == 'PANELA'].copy()

        # And on the 2016 year
        cbcr = cbcr[cbcr['Year'] == 2016].copy()

        cbcr.drop(
            columns=['PAN', 'Grouping', 'Flag Codes', 'Flags', 'YEA', 'Year'],
            inplace=True
        )

        # We reshape the DataFrame from long into wide format, columns corresponding to the various financial variables
        cbcr_wide = cbcr.pivot(
            index=['COU', 'Ultimate Parent Jurisdiction', 'JUR', 'Partner Jurisdiction'],
            columns='Variable',
            values='Value'
        ).reset_index()

        # We limit ourselves to the relevant variables
        oecd = cbcr_wide[
            [
                'COU', 'Ultimate Parent Jurisdiction', 'JUR', 'Partner Jurisdiction',
                'Related Party Revenues', 'Total Revenues', 'Unrelated Party Revenues'
            ]
        ].copy()

        oecd.rename(
            columns={
                'COU': 'PARENT_COUNTRY_CODE',
                'Ultimate Parent Jurisdiction': 'PARENT_COUNTRY_NAME',
                'JUR': 'AFFILIATE_COUNTRY_CODE',
                'Partner Jurisdiction': 'AFFILIATE_COUNTRY_NAME',
                'Related Party Revenues': 'RELATED_PARTY_REVENUES',
                'Total Revenues': 'TOTAL_REVENUES',
                'Unrelated Party Revenues': 'UNRELATED_PARTY_REVENUES'
            },
            inplace=True
        )

        oecd.rename_axis(None, axis=1, inplace=True)

        # We eliminate Stateless entities
        oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Stateless'].copy()

        # We eliminate countries with minimum reporting (only a domestic / foreign split)
        mask = oecd['PARENT_COUNTRY_CODE'].map(
            lambda code: (oecd['PARENT_COUNTRY_CODE'] == code).sum()
        ) > 2
        oecd = oecd[mask].copy()

        # And we can now eliminate Foreign Jurisdictions Total rows
        oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Foreign Jurisdictions Total'].copy()

        # We group the rows corresponding to UK Caribbean Islands
        oecd['AFFILIATE_COUNTRY_CODE'] = oecd['AFFILIATE_COUNTRY_CODE'].map(
            lambda country_code: country_code if country_code not in UK_CARIBBEAN_ISLANDS else 'UKI'
        )
        oecd['AFFILIATE_COUNTRY_NAME'] = oecd.apply(
            (
                lambda row: row['AFFILIATE_COUNTRY_NAME']
                if row['AFFILIATE_COUNTRY_CODE'] != 'UKI' else 'UK Caribbean Islands'
            ),
            axis=1
        )
        oecd = oecd.groupby(
            ['PARENT_COUNTRY_CODE', 'PARENT_COUNTRY_NAME', 'AFFILIATE_COUNTRY_CODE', 'AFFILIATE_COUNTRY_NAME']
        ).sum().reset_index()

        # We add continent codes to the dataset from the "geographies.csv" file
        geographies = pd.read_csv(self.path_to_geographies)

        oecd = oecd.merge(
            geographies[['CODE', 'CONTINENT_CODE']].drop_duplicates(),
            how='left',
            left_on='AFFILIATE_COUNTRY_CODE', right_on='CODE'
        )

        oecd.drop(columns=['CODE'], inplace=True)

        oecd['CONTINENT_CODE'] = oecd.apply(
            lambda row: impute_missing_continent_codes(row, self.continent_code_imputations),
            axis=1
        )

        # We limit ourselves to 4 continents (Europe, Africa, America and Asia-Pacific)
        oecd['CONTINENT_CODE'] = oecd['CONTINENT_CODE'].map(
            lambda x: 'APAC' if x in ['ASIA', 'OCN'] or x is None else x
        )

        oecd['CONTINENT_CODE'] = oecd['CONTINENT_CODE'].map(
            lambda x: 'AMR' if x in ['SAMR', 'NAMR'] else x
        )

        # In 2016, one row (India - Bouvet Island) which is full of 0s; we eliminate it
        oecd = oecd[oecd['CONTINENT_CODE'] != 'ATC'].copy()

        return oecd.copy()

    def get_revenue_data_with_tax_havens(self):
        """
        Building upon the "get_preprocessed_revenue_data" method, we add an indicator variable - IS_TAX_HAVEN - valued
        to 1 if the country code is identified in the TWZ list or to 0 if not.
        """
        oecd = self.get_preprocessed_revenue_data()

        tax_havens = pd.read_csv(self.path_to_tax_haven_list)

        oecd['IS_TAX_HAVEN'] = oecd['AFFILIATE_COUNTRY_CODE'].isin(tax_havens['CODE'].unique())

        return oecd.copy()

    def get_revenue_data_with_GNI(self):
        """
        Building upon the "get_revenue_data_with_tax_havens" method, we add the Gross National Income (GNI) variables
        and return the resulting dataset.
        """
        oecd = self.get_revenue_data_with_tax_havens()

        # We load the data from the CSV file
        gross_national_income = pd.read_csv(self.path_to_GNI_data, delimiter=';')

        # All numeric columns in the file must be slightly preprocessed to be considered as floats
        for column in gross_national_income.columns[2:]:

            gross_national_income[column] = gross_national_income[column].map(
                lambda x: x.replace(',', '.') if isinstance(x, str) else x
            )

            gross_national_income[column] = gross_national_income[column].astype(float)

        # We add GNI variables to the main dataset
        oecd = oecd.merge(
            gross_national_income[['COUNTRY_CODE', 'GNI_2016']].copy(),
            how='left',
            left_on='AFFILIATE_COUNTRY_CODE', right_on='COUNTRY_CODE'
        )

        oecd.drop(columns=['COUNTRY_CODE'], inplace=True)

        return oecd.copy()

    def get_scatterplot_data(self, breakdown_threshold):
        """
        This method allows to output the relevant data for building the scatterplots showing, for each parent jurisdi-
        ction the relationship between partner jurisdictions’ share of multinational companies’ foreign unrelated-party
        revenues and their share of the Gross National Income (GNI) in the dataset.
        """
        oecd = self.get_revenue_data_with_GNI()

        # We eliminate foreign jurisdictions totals and rows corresponding to domestic activities
        oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Foreign Jurisdictions Total'].copy()
        oecd = oecd[oecd['PARENT_COUNTRY_CODE'] != oecd['AFFILIATE_COUNTRY_CODE']].copy()

        # The "NB_AFFILIATE_COUNTRIES" column shows the number of partners reported by the corresponding parent country
        oecd['NB_AFFILIATE_COUNTRIES'] = oecd['PARENT_COUNTRY_CODE'].map(
            lambda code: (oecd['PARENT_COUNTRY_CODE'] == code).sum()
        )

        # We only show the scatterplot for parent countries that report a minimum number of partner jurisdictions
        oecd_restricted = oecd[oecd['NB_AFFILIATE_COUNTRIES'] >= breakdown_threshold].copy()

        return oecd_restricted.dropna()

    def show_scatterplots(self, breakdown_threshold=60):
        """
        Building upon the "get_scatterplot_data" method, this method allows to output the scatterplots showing, for each
        parent jurisdiction in the OECD's country-by-country statistics, the relationship between partner jurisdictions’
        share of multinational companies’ foreign unrelated-party revenues and their share of the Gross National Income
        (GNI) in the dataset.
        """

        # We fetch the data to be displayed within the scatterplots thanks to the dedicated method
        oecd_restricted = self.get_scatterplot_data(breakdown_threshold=breakdown_threshold)

        # The number of parent countries determines the number of graphs to construct
        nb_countries = oecd_restricted['PARENT_COUNTRY_CODE'].nunique()

        ncols = 2
        nrows = int(nb_countries / ncols) + 1

        fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(17, 50))

        # We plot the graph for each parent country in the preprocessed country-by-country statistics
        for parent_country, ax in zip(oecd_restricted['PARENT_COUNTRY_CODE'].unique(), axes.flatten()):

            restricted_df = oecd_restricted[oecd_restricted['PARENT_COUNTRY_CODE'] == parent_country].copy()

            restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'] = (
                restricted_df['UNRELATED_PARTY_REVENUES'] / restricted_df['UNRELATED_PARTY_REVENUES'].sum()
            )

            restricted_df['SHARE_OF_GNI_2016'] = (
                restricted_df['GNI_2016'] / restricted_df['GNI_2016'].sum()
            )

            sns.scatterplot(
                x='SHARE_OF_GNI_2016',
                y='SHARE_OF_UNRELATED_PARTY_REVENUES',
                hue='IS_TAX_HAVEN',
                data=restricted_df,
                ax=ax
            )

            ax.set_title(parent_country)

        plt.show()

Classes

class CbCRPreprocessor (load_raw_data=True, path_to_geographies='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales/data/geographies.csv', continent_code_imputations={'OAF': 'AFR', 'OAM': 'AMR', 'OAS': 'APAC', 'OTE': 'EUR', 'AFRIC': 'AFR', 'AMER': 'AMR', 'ASIAT': 'APAC', 'EUROP': 'EUR', 'GRPS': 'OTHER_GROUPS', 'UKI': 'AMR'}, path_to_GNI_data='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales/data/gross_national_income.csv', path_to_tax_haven_list='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales/data/tax_havens.csv', fetch_data_online=False)

The instructions allowing to load and preprocess OECD data are organised in a Python class, CbCRPreprocessor.

This is the instantiation function for this class. It requires several arguments:

  • "load_raw_data": a boolean indicating whether or not to directly load the data in a class attribute;

  • "path_to_geographies": the string path to the "geographies.csv" file, used for instance to complement the OECD's country-by-country data with country codes;

  • "continent_code_imputations": a dictionary with the OECD codes (for continental aggregates in particular) as keys and the codes that we use throughout our computations as values;

  • "path_to_GNI_data": a string path to the "gross_national_income.csv" file, used to build comparisons between the distribution of revenue variables and that of Gross National Income (GNI);

  • "path_to_tax_haven_list": a string path to the "tax_havens.csv" file, that contains the list of tax havens compiled by Tørsløv, Wier and Zucman (2019);

  • "fetch_data_online": a boolean indicating whether to fetch the country-by-country data online (if set to True) or locally from the "data" folder (if set to False).

Expand source code
class CbCRPreprocessor:

    def __init__(
        self,
        load_raw_data=True,
        path_to_geographies=path_to_geographies,
        continent_code_imputations=CONTINENT_CODES_TO_IMPUTE_OECD_CBCR,
        path_to_GNI_data=path_to_GNI_data,
        path_to_tax_haven_list=path_to_tax_haven_list,
        fetch_data_online=False
    ):
        """
        The instructions allowing to load and preprocess OECD data are organised in a Python class, CbCRPreprocessor.

        This is the instantiation function for this class. It requires several arguments:

        - "load_raw_data": a boolean indicating whether or not to directly load the data in a class attribute;

        - "path_to_geographies": the string path to the "geographies.csv" file, used for instance to complement the
        OECD's country-by-country data with country codes;

        - "continent_code_imputations": a dictionary with the OECD codes (for continental aggregates in particular) as
        keys and the codes that we use throughout our computations as values;

        - "path_to_GNI_data": a string path to the "gross_national_income.csv" file, used to build comparisons between
        the distribution of revenue variables and that of Gross National Income (GNI);

        - "path_to_tax_haven_list": a string path to the "tax_havens.csv" file, that contains the list of tax havens
        compiled by Tørsløv, Wier and Zucman (2019);

        - "fetch_data_online": a boolean indicating whether to fetch the country-by-country data online (if set to True)
        or locally from the "data" folder (if set to False).
        """
        if fetch_data_online:
            # If relevant, we construct the URL from which we can load the CSV country-by-country dataset
            self.url_base = 'http://stats.oecd.org/SDMX-JSON/data/'
            self.dataset_identifier = 'CBCR_TABLEI/'
            self.dimensions = 'ALL/'
            self.agency_name = 'OECD'

            self.path_to_OECD_data = (
                self.url_base + self.dataset_identifier + self.dimensions + self.agency_name + '?contenttype=csv'
            )

        else:
            # Or we use the path to the local file
            self.path_to_OECD_CbCR_data = local_path_to_OECD_CbCR_data

        self.path_to_geographies = path_to_geographies
        self.continent_code_imputations = continent_code_imputations

        self.path_to_GNI_data = path_to_GNI_data
        self.path_to_tax_haven_list = path_to_tax_haven_list

        # If relevant, we load the data in a dedicated class attribute
        if load_raw_data:
            if fetch_data_online:
                print("Fetching the OECD's aggregated and anonymized CbCR data - This may take up to 30 seconds.")

            self.data = pd.read_csv(self.path_to_OECD_CbCR_data)

            if fetch_data_online:
                print("Loaded the OECD's aggregated and anonymized CbCR data successfully.")

        else:
            self.data = None

    def load_raw_data(self):
        """
        This class method allows to load the OECD's anonymized and aggregated data from the pre-constructed string path
        (see the instantiation function above), in case it has not been done in the instantiation function (after having
        set load_raw_data=False). Data are stored in a dedicated class attribute.
        """
        self.data = pd.read_csv(self.path_to_OECD_CbCR_data)

    def get_preprocessed_revenue_data(self):
        """
        This class method, relying on the pre-loaded data, applies a series of preprocessing steps to the raw data. The
        latter has indeed been loaded from the complete CSV file with a lot of non-relevant fields and a specific orga-
        nisation. In particular, we pivot the DataFrame to move from a long to a wide dataset.
        """
        if self.data is None:
            raise Exception('You must load the data with the dedicated method before you may run any computation.')

        cbcr = self.data.copy()

        # We focus on the positive-profit sub-sample only
        cbcr = cbcr[cbcr['PAN'] == 'PANELA'].copy()

        # And on the 2016 year
        cbcr = cbcr[cbcr['Year'] == 2016].copy()

        cbcr.drop(
            columns=['PAN', 'Grouping', 'Flag Codes', 'Flags', 'YEA', 'Year'],
            inplace=True
        )

        # We reshape the DataFrame from long into wide format, columns corresponding to the various financial variables
        cbcr_wide = cbcr.pivot(
            index=['COU', 'Ultimate Parent Jurisdiction', 'JUR', 'Partner Jurisdiction'],
            columns='Variable',
            values='Value'
        ).reset_index()

        # We limit ourselves to the relevant variables
        oecd = cbcr_wide[
            [
                'COU', 'Ultimate Parent Jurisdiction', 'JUR', 'Partner Jurisdiction',
                'Related Party Revenues', 'Total Revenues', 'Unrelated Party Revenues'
            ]
        ].copy()

        oecd.rename(
            columns={
                'COU': 'PARENT_COUNTRY_CODE',
                'Ultimate Parent Jurisdiction': 'PARENT_COUNTRY_NAME',
                'JUR': 'AFFILIATE_COUNTRY_CODE',
                'Partner Jurisdiction': 'AFFILIATE_COUNTRY_NAME',
                'Related Party Revenues': 'RELATED_PARTY_REVENUES',
                'Total Revenues': 'TOTAL_REVENUES',
                'Unrelated Party Revenues': 'UNRELATED_PARTY_REVENUES'
            },
            inplace=True
        )

        oecd.rename_axis(None, axis=1, inplace=True)

        # We eliminate Stateless entities
        oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Stateless'].copy()

        # We eliminate countries with minimum reporting (only a domestic / foreign split)
        mask = oecd['PARENT_COUNTRY_CODE'].map(
            lambda code: (oecd['PARENT_COUNTRY_CODE'] == code).sum()
        ) > 2
        oecd = oecd[mask].copy()

        # And we can now eliminate Foreign Jurisdictions Total rows
        oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Foreign Jurisdictions Total'].copy()

        # We group the rows corresponding to UK Caribbean Islands
        oecd['AFFILIATE_COUNTRY_CODE'] = oecd['AFFILIATE_COUNTRY_CODE'].map(
            lambda country_code: country_code if country_code not in UK_CARIBBEAN_ISLANDS else 'UKI'
        )
        oecd['AFFILIATE_COUNTRY_NAME'] = oecd.apply(
            (
                lambda row: row['AFFILIATE_COUNTRY_NAME']
                if row['AFFILIATE_COUNTRY_CODE'] != 'UKI' else 'UK Caribbean Islands'
            ),
            axis=1
        )
        oecd = oecd.groupby(
            ['PARENT_COUNTRY_CODE', 'PARENT_COUNTRY_NAME', 'AFFILIATE_COUNTRY_CODE', 'AFFILIATE_COUNTRY_NAME']
        ).sum().reset_index()

        # We add continent codes to the dataset from the "geographies.csv" file
        geographies = pd.read_csv(self.path_to_geographies)

        oecd = oecd.merge(
            geographies[['CODE', 'CONTINENT_CODE']].drop_duplicates(),
            how='left',
            left_on='AFFILIATE_COUNTRY_CODE', right_on='CODE'
        )

        oecd.drop(columns=['CODE'], inplace=True)

        oecd['CONTINENT_CODE'] = oecd.apply(
            lambda row: impute_missing_continent_codes(row, self.continent_code_imputations),
            axis=1
        )

        # We limit ourselves to 4 continents (Europe, Africa, America and Asia-Pacific)
        oecd['CONTINENT_CODE'] = oecd['CONTINENT_CODE'].map(
            lambda x: 'APAC' if x in ['ASIA', 'OCN'] or x is None else x
        )

        oecd['CONTINENT_CODE'] = oecd['CONTINENT_CODE'].map(
            lambda x: 'AMR' if x in ['SAMR', 'NAMR'] else x
        )

        # In 2016, one row (India - Bouvet Island) which is full of 0s; we eliminate it
        oecd = oecd[oecd['CONTINENT_CODE'] != 'ATC'].copy()

        return oecd.copy()

    def get_revenue_data_with_tax_havens(self):
        """
        Building upon the "get_preprocessed_revenue_data" method, we add an indicator variable - IS_TAX_HAVEN - valued
        to 1 if the country code is identified in the TWZ list or to 0 if not.
        """
        oecd = self.get_preprocessed_revenue_data()

        tax_havens = pd.read_csv(self.path_to_tax_haven_list)

        oecd['IS_TAX_HAVEN'] = oecd['AFFILIATE_COUNTRY_CODE'].isin(tax_havens['CODE'].unique())

        return oecd.copy()

    def get_revenue_data_with_GNI(self):
        """
        Building upon the "get_revenue_data_with_tax_havens" method, we add the Gross National Income (GNI) variables
        and return the resulting dataset.
        """
        oecd = self.get_revenue_data_with_tax_havens()

        # We load the data from the CSV file
        gross_national_income = pd.read_csv(self.path_to_GNI_data, delimiter=';')

        # All numeric columns in the file must be slightly preprocessed to be considered as floats
        for column in gross_national_income.columns[2:]:

            gross_national_income[column] = gross_national_income[column].map(
                lambda x: x.replace(',', '.') if isinstance(x, str) else x
            )

            gross_national_income[column] = gross_national_income[column].astype(float)

        # We add GNI variables to the main dataset
        oecd = oecd.merge(
            gross_national_income[['COUNTRY_CODE', 'GNI_2016']].copy(),
            how='left',
            left_on='AFFILIATE_COUNTRY_CODE', right_on='COUNTRY_CODE'
        )

        oecd.drop(columns=['COUNTRY_CODE'], inplace=True)

        return oecd.copy()

    def get_scatterplot_data(self, breakdown_threshold):
        """
        This method allows to output the relevant data for building the scatterplots showing, for each parent jurisdi-
        ction the relationship between partner jurisdictions’ share of multinational companies’ foreign unrelated-party
        revenues and their share of the Gross National Income (GNI) in the dataset.
        """
        oecd = self.get_revenue_data_with_GNI()

        # We eliminate foreign jurisdictions totals and rows corresponding to domestic activities
        oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Foreign Jurisdictions Total'].copy()
        oecd = oecd[oecd['PARENT_COUNTRY_CODE'] != oecd['AFFILIATE_COUNTRY_CODE']].copy()

        # The "NB_AFFILIATE_COUNTRIES" column shows the number of partners reported by the corresponding parent country
        oecd['NB_AFFILIATE_COUNTRIES'] = oecd['PARENT_COUNTRY_CODE'].map(
            lambda code: (oecd['PARENT_COUNTRY_CODE'] == code).sum()
        )

        # We only show the scatterplot for parent countries that report a minimum number of partner jurisdictions
        oecd_restricted = oecd[oecd['NB_AFFILIATE_COUNTRIES'] >= breakdown_threshold].copy()

        return oecd_restricted.dropna()

    def show_scatterplots(self, breakdown_threshold=60):
        """
        Building upon the "get_scatterplot_data" method, this method allows to output the scatterplots showing, for each
        parent jurisdiction in the OECD's country-by-country statistics, the relationship between partner jurisdictions’
        share of multinational companies’ foreign unrelated-party revenues and their share of the Gross National Income
        (GNI) in the dataset.
        """

        # We fetch the data to be displayed within the scatterplots thanks to the dedicated method
        oecd_restricted = self.get_scatterplot_data(breakdown_threshold=breakdown_threshold)

        # The number of parent countries determines the number of graphs to construct
        nb_countries = oecd_restricted['PARENT_COUNTRY_CODE'].nunique()

        ncols = 2
        nrows = int(nb_countries / ncols) + 1

        fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(17, 50))

        # We plot the graph for each parent country in the preprocessed country-by-country statistics
        for parent_country, ax in zip(oecd_restricted['PARENT_COUNTRY_CODE'].unique(), axes.flatten()):

            restricted_df = oecd_restricted[oecd_restricted['PARENT_COUNTRY_CODE'] == parent_country].copy()

            restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'] = (
                restricted_df['UNRELATED_PARTY_REVENUES'] / restricted_df['UNRELATED_PARTY_REVENUES'].sum()
            )

            restricted_df['SHARE_OF_GNI_2016'] = (
                restricted_df['GNI_2016'] / restricted_df['GNI_2016'].sum()
            )

            sns.scatterplot(
                x='SHARE_OF_GNI_2016',
                y='SHARE_OF_UNRELATED_PARTY_REVENUES',
                hue='IS_TAX_HAVEN',
                data=restricted_df,
                ax=ax
            )

            ax.set_title(parent_country)

        plt.show()

Methods

def get_preprocessed_revenue_data(self)

This class method, relying on the pre-loaded data, applies a series of preprocessing steps to the raw data. The latter has indeed been loaded from the complete CSV file with a lot of non-relevant fields and a specific orga- nisation. In particular, we pivot the DataFrame to move from a long to a wide dataset.

Expand source code
def get_preprocessed_revenue_data(self):
    """
    This class method, relying on the pre-loaded data, applies a series of preprocessing steps to the raw data. The
    latter has indeed been loaded from the complete CSV file with a lot of non-relevant fields and a specific orga-
    nisation. In particular, we pivot the DataFrame to move from a long to a wide dataset.
    """
    if self.data is None:
        raise Exception('You must load the data with the dedicated method before you may run any computation.')

    cbcr = self.data.copy()

    # We focus on the positive-profit sub-sample only
    cbcr = cbcr[cbcr['PAN'] == 'PANELA'].copy()

    # And on the 2016 year
    cbcr = cbcr[cbcr['Year'] == 2016].copy()

    cbcr.drop(
        columns=['PAN', 'Grouping', 'Flag Codes', 'Flags', 'YEA', 'Year'],
        inplace=True
    )

    # We reshape the DataFrame from long into wide format, columns corresponding to the various financial variables
    cbcr_wide = cbcr.pivot(
        index=['COU', 'Ultimate Parent Jurisdiction', 'JUR', 'Partner Jurisdiction'],
        columns='Variable',
        values='Value'
    ).reset_index()

    # We limit ourselves to the relevant variables
    oecd = cbcr_wide[
        [
            'COU', 'Ultimate Parent Jurisdiction', 'JUR', 'Partner Jurisdiction',
            'Related Party Revenues', 'Total Revenues', 'Unrelated Party Revenues'
        ]
    ].copy()

    oecd.rename(
        columns={
            'COU': 'PARENT_COUNTRY_CODE',
            'Ultimate Parent Jurisdiction': 'PARENT_COUNTRY_NAME',
            'JUR': 'AFFILIATE_COUNTRY_CODE',
            'Partner Jurisdiction': 'AFFILIATE_COUNTRY_NAME',
            'Related Party Revenues': 'RELATED_PARTY_REVENUES',
            'Total Revenues': 'TOTAL_REVENUES',
            'Unrelated Party Revenues': 'UNRELATED_PARTY_REVENUES'
        },
        inplace=True
    )

    oecd.rename_axis(None, axis=1, inplace=True)

    # We eliminate Stateless entities
    oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Stateless'].copy()

    # We eliminate countries with minimum reporting (only a domestic / foreign split)
    mask = oecd['PARENT_COUNTRY_CODE'].map(
        lambda code: (oecd['PARENT_COUNTRY_CODE'] == code).sum()
    ) > 2
    oecd = oecd[mask].copy()

    # And we can now eliminate Foreign Jurisdictions Total rows
    oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Foreign Jurisdictions Total'].copy()

    # We group the rows corresponding to UK Caribbean Islands
    oecd['AFFILIATE_COUNTRY_CODE'] = oecd['AFFILIATE_COUNTRY_CODE'].map(
        lambda country_code: country_code if country_code not in UK_CARIBBEAN_ISLANDS else 'UKI'
    )
    oecd['AFFILIATE_COUNTRY_NAME'] = oecd.apply(
        (
            lambda row: row['AFFILIATE_COUNTRY_NAME']
            if row['AFFILIATE_COUNTRY_CODE'] != 'UKI' else 'UK Caribbean Islands'
        ),
        axis=1
    )
    oecd = oecd.groupby(
        ['PARENT_COUNTRY_CODE', 'PARENT_COUNTRY_NAME', 'AFFILIATE_COUNTRY_CODE', 'AFFILIATE_COUNTRY_NAME']
    ).sum().reset_index()

    # We add continent codes to the dataset from the "geographies.csv" file
    geographies = pd.read_csv(self.path_to_geographies)

    oecd = oecd.merge(
        geographies[['CODE', 'CONTINENT_CODE']].drop_duplicates(),
        how='left',
        left_on='AFFILIATE_COUNTRY_CODE', right_on='CODE'
    )

    oecd.drop(columns=['CODE'], inplace=True)

    oecd['CONTINENT_CODE'] = oecd.apply(
        lambda row: impute_missing_continent_codes(row, self.continent_code_imputations),
        axis=1
    )

    # We limit ourselves to 4 continents (Europe, Africa, America and Asia-Pacific)
    oecd['CONTINENT_CODE'] = oecd['CONTINENT_CODE'].map(
        lambda x: 'APAC' if x in ['ASIA', 'OCN'] or x is None else x
    )

    oecd['CONTINENT_CODE'] = oecd['CONTINENT_CODE'].map(
        lambda x: 'AMR' if x in ['SAMR', 'NAMR'] else x
    )

    # In 2016, one row (India - Bouvet Island) which is full of 0s; we eliminate it
    oecd = oecd[oecd['CONTINENT_CODE'] != 'ATC'].copy()

    return oecd.copy()
def get_revenue_data_with_GNI(self)

Building upon the "get_revenue_data_with_tax_havens" method, we add the Gross National Income (GNI) variables and return the resulting dataset.

Expand source code
def get_revenue_data_with_GNI(self):
    """
    Building upon the "get_revenue_data_with_tax_havens" method, we add the Gross National Income (GNI) variables
    and return the resulting dataset.
    """
    oecd = self.get_revenue_data_with_tax_havens()

    # We load the data from the CSV file
    gross_national_income = pd.read_csv(self.path_to_GNI_data, delimiter=';')

    # All numeric columns in the file must be slightly preprocessed to be considered as floats
    for column in gross_national_income.columns[2:]:

        gross_national_income[column] = gross_national_income[column].map(
            lambda x: x.replace(',', '.') if isinstance(x, str) else x
        )

        gross_national_income[column] = gross_national_income[column].astype(float)

    # We add GNI variables to the main dataset
    oecd = oecd.merge(
        gross_national_income[['COUNTRY_CODE', 'GNI_2016']].copy(),
        how='left',
        left_on='AFFILIATE_COUNTRY_CODE', right_on='COUNTRY_CODE'
    )

    oecd.drop(columns=['COUNTRY_CODE'], inplace=True)

    return oecd.copy()
def get_revenue_data_with_tax_havens(self)

Building upon the "get_preprocessed_revenue_data" method, we add an indicator variable - IS_TAX_HAVEN - valued to 1 if the country code is identified in the TWZ list or to 0 if not.

Expand source code
def get_revenue_data_with_tax_havens(self):
    """
    Building upon the "get_preprocessed_revenue_data" method, we add an indicator variable - IS_TAX_HAVEN - valued
    to 1 if the country code is identified in the TWZ list or to 0 if not.
    """
    oecd = self.get_preprocessed_revenue_data()

    tax_havens = pd.read_csv(self.path_to_tax_haven_list)

    oecd['IS_TAX_HAVEN'] = oecd['AFFILIATE_COUNTRY_CODE'].isin(tax_havens['CODE'].unique())

    return oecd.copy()
def get_scatterplot_data(self, breakdown_threshold)

This method allows to output the relevant data for building the scatterplots showing, for each parent jurisdi- ction the relationship between partner jurisdictions’ share of multinational companies’ foreign unrelated-party revenues and their share of the Gross National Income (GNI) in the dataset.

Expand source code
def get_scatterplot_data(self, breakdown_threshold):
    """
    This method allows to output the relevant data for building the scatterplots showing, for each parent jurisdi-
    ction the relationship between partner jurisdictions’ share of multinational companies’ foreign unrelated-party
    revenues and their share of the Gross National Income (GNI) in the dataset.
    """
    oecd = self.get_revenue_data_with_GNI()

    # We eliminate foreign jurisdictions totals and rows corresponding to domestic activities
    oecd = oecd[oecd['AFFILIATE_COUNTRY_NAME'] != 'Foreign Jurisdictions Total'].copy()
    oecd = oecd[oecd['PARENT_COUNTRY_CODE'] != oecd['AFFILIATE_COUNTRY_CODE']].copy()

    # The "NB_AFFILIATE_COUNTRIES" column shows the number of partners reported by the corresponding parent country
    oecd['NB_AFFILIATE_COUNTRIES'] = oecd['PARENT_COUNTRY_CODE'].map(
        lambda code: (oecd['PARENT_COUNTRY_CODE'] == code).sum()
    )

    # We only show the scatterplot for parent countries that report a minimum number of partner jurisdictions
    oecd_restricted = oecd[oecd['NB_AFFILIATE_COUNTRIES'] >= breakdown_threshold].copy()

    return oecd_restricted.dropna()
def load_raw_data(self)

This class method allows to load the OECD's anonymized and aggregated data from the pre-constructed string path (see the instantiation function above), in case it has not been done in the instantiation function (after having set load_raw_data=False). Data are stored in a dedicated class attribute.

Expand source code
def load_raw_data(self):
    """
    This class method allows to load the OECD's anonymized and aggregated data from the pre-constructed string path
    (see the instantiation function above), in case it has not been done in the instantiation function (after having
    set load_raw_data=False). Data are stored in a dedicated class attribute.
    """
    self.data = pd.read_csv(self.path_to_OECD_CbCR_data)
def show_scatterplots(self, breakdown_threshold=60)

Building upon the "get_scatterplot_data" method, this method allows to output the scatterplots showing, for each parent jurisdiction in the OECD's country-by-country statistics, the relationship between partner jurisdictions’ share of multinational companies’ foreign unrelated-party revenues and their share of the Gross National Income (GNI) in the dataset.

Expand source code
def show_scatterplots(self, breakdown_threshold=60):
    """
    Building upon the "get_scatterplot_data" method, this method allows to output the scatterplots showing, for each
    parent jurisdiction in the OECD's country-by-country statistics, the relationship between partner jurisdictions’
    share of multinational companies’ foreign unrelated-party revenues and their share of the Gross National Income
    (GNI) in the dataset.
    """

    # We fetch the data to be displayed within the scatterplots thanks to the dedicated method
    oecd_restricted = self.get_scatterplot_data(breakdown_threshold=breakdown_threshold)

    # The number of parent countries determines the number of graphs to construct
    nb_countries = oecd_restricted['PARENT_COUNTRY_CODE'].nunique()

    ncols = 2
    nrows = int(nb_countries / ncols) + 1

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(17, 50))

    # We plot the graph for each parent country in the preprocessed country-by-country statistics
    for parent_country, ax in zip(oecd_restricted['PARENT_COUNTRY_CODE'].unique(), axes.flatten()):

        restricted_df = oecd_restricted[oecd_restricted['PARENT_COUNTRY_CODE'] == parent_country].copy()

        restricted_df['SHARE_OF_UNRELATED_PARTY_REVENUES'] = (
            restricted_df['UNRELATED_PARTY_REVENUES'] / restricted_df['UNRELATED_PARTY_REVENUES'].sum()
        )

        restricted_df['SHARE_OF_GNI_2016'] = (
            restricted_df['GNI_2016'] / restricted_df['GNI_2016'].sum()
        )

        sns.scatterplot(
            x='SHARE_OF_GNI_2016',
            y='SHARE_OF_UNRELATED_PARTY_REVENUES',
            hue='IS_TAX_HAVEN',
            data=restricted_df,
            ax=ax
        )

        ax.set_title(parent_country)

    plt.show()