Module `destination_based_sales.revenue_split`

This module is used to split the revenue variables of the IRS country-by-country data into three categories (sales to the affiliate country, sales to the US and sales to any third country), based on BEA data. It therefore relies on other Python files, essentially irs.py and bea.py. The methodology is detailed either in the PDF report or in the docstrings and comments below.

Expand source code

"""
This module is used to split the revenue variables of the IRS country-by-country data into three categories (sales to
the affiliate country, sales to the US and sales to any third country), based on BEA data. It therefore relies on other
Python files, essentially irs.py and bea.py. The methodology is detailed either in the PDF report or in the docstrings
and comments below.
"""


########################################################################################################################
# --- Imports

import os

import numpy as np
import pandas as pd

from destination_based_sales.irs import IRSDataPreprocessor
from destination_based_sales.bea import BEADataPreprocessor

from destination_based_sales.utils import eliminate_irrelevant_percentages, impute_missing_values


########################################################################################################################
# --- Diverse

path_to_dir = os.path.dirname(os.path.abspath(__file__))


########################################################################################################################
# --- Content

class RevenueSplitter:

    def __init__(
        self,
        year,
        include_US=True,
        path_to_dir=path_to_dir
    ):
        """
        The logic allowing to split revenue variables is encapsulated in a Python class, RevenueSplitter.

        This is the instantiation function for this class, which requires the following arguments:

        - the year to consider;
        - a boolean, indicating whether or not to include US-US sales in the split;
        - and the path to the directory where the Python file is located, to retrieve the necessary data.
        """
        self.year = year

        self.irs_preprocessor = IRSDataPreprocessor(year=year)
        self.bea_preprocessor = BEADataPreprocessor(year=year)

        self.include_US = include_US

        # We reconstruct the path to the Excel file that contains the BEA data we use for the split of US-US sales
        self.path_to_BEA_KR_tables = os.path.join(
            path_to_dir,
            'data',
            str(year),
            'Part-I-K1-R2.xls'
        )

    def merge_dataframes(self, include_US=True):
        """
        This class method is used to combine the IRS and BEA dataset. If US-US sales are excluded, it consists in a sim-
        ple merge, with a few duplicate columns to filter out. On the other hand, if US-US sales are included, since
        their split is based on a secondary file provided by the BEA, we have to operate the split separately and re-
        introduce it in the merged dataset. Whether to include or not the US-US sales is determined by the "include_US"
        boolean argument.
        """
        irs = self.irs_preprocessor.load_final_data()
        bea = self.bea_preprocessor.load_final_data()

        # We merge the two datasets (IRS and BEA)
        merged_df = irs.merge(
            bea,
            how='left',
            on='CODE'
        )

        merged_df.drop(
            columns=['AFFILIATE_COUNTRY_NAME_y', 'CONTINENT_NAME_y', 'CONTINENT_CODE_y', 'NAME'],
            inplace=True
        )

        merged_df.rename(
            columns={
                'AFFILIATE_COUNTRY_NAME_x': 'AFFILIATE_COUNTRY_NAME',
                'CONTINENT_NAME_x': 'CONTINENT_NAME',
                'CONTINENT_CODE_x': 'CONTINENT_CODE'
            },
            inplace=True
        )

        if include_US:
            # We load the secondary file from the BEA, with the split of US-US sales
            df = pd.read_excel(self.path_to_BEA_KR_tables, sheet_name='Table I.O 1')

            # Cleaning and reorganising the table
            column_names = df.loc[4].to_dict().copy()
            column_names[list(column_names.keys())[0]] = 'Industry'
            column_names['Unnamed: 1'] = 'Total'

            df.rename(columns=column_names, inplace=True)

            # Extracting information on the US-US sales
            us_sales = df.loc[6].to_dict()

            us_imputation = {}

            # Imputing the BEA-like distribution of US-US sales into the merged DataFrame
            # Sales to the affiliate country and to the headquarter country are directed to the same final destination
            for column in merged_df.columns[-11:]:

                if column == 'TOTAL':
                    us_imputation[column] = us_sales['Total']

                elif 'US' in column:
                    us_imputation[column] = us_sales['To U.S. persons'] * 1

                elif 'AFFILIATE_COUNTRY' in column:
                    us_imputation[column] = us_sales['To U.S. persons'] * 0

                else:
                    us_imputation[column] = us_sales['To foreign affiliates'] + us_sales['To other foreign persons']

            for column in merged_df.columns[-11:]:
                merged_df[column] = merged_df.apply(
                    lambda row: us_imputation[column] if row['CODE'] == 'USA' else row[column],
                    axis=1
                )

            return merged_df.copy()

        else:

            merged_df = merged_df[merged_df['CODE'] != 'USA'].copy()

            return merged_df.copy()

    def add_indicator_variables(self):
        """
        Building upon the previous method, "merge_dataframes", this method complements the dataset obtained by merging
        data from the IRS and those from the BEA with indicator variables that indicate, for each partner country,
        whether the information from the BEA is complete and allows to split the country-by-country revenue variables.
        In that sense, this method paves the way for the imputation of missing information (see below).

        NB: for many countries, all indicator variables will take value 0 simply because the partner country of the IRS'
        country-by-country statistics is absent from BEA data.
        """

        # We get the merged DataFrame
        merged_df = self.merge_dataframes(include_US=self.include_US)

        mask_US = (merged_df['CODE'] == 'USA')

        # Is the split of related-party sales complete?
        related = ['TOTAL_US_RELATED', 'TOTAL_AFFILIATE_COUNTRY_RELATED', 'TOTAL_OTHER_COUNTRY_RELATED']

        mask_0 = ~merged_df[related[0]].isnull()
        mask_1 = ~merged_df[related[1]].isnull()
        mask_2 = ~merged_df[related[2]].isnull()

        mask = np.logical_and(
            mask_0,
            np.logical_and(
                mask_1,
                mask_2
            )
        )

        # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
        merged_df['IS_RELATED_COMPLETE'] = mask * 1 + mask_US * 1

        self.related = related.copy()

        # Is the split of unrelated-party sales complete?
        unrelated = ['TOTAL_US_UNRELATED', 'TOTAL_AFFILIATE_COUNTRY_UNRELATED', 'TOTAL_OTHER_COUNTRY_UNRELATED']

        mask_0 = ~merged_df[unrelated[0]].isnull()
        mask_1 = ~merged_df[unrelated[1]].isnull()
        mask_2 = ~merged_df[unrelated[2]].isnull()

        mask = np.logical_and(
            mask_0,
            np.logical_and(
                mask_1,
                mask_2
            )
        )

        # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
        merged_df['IS_UNRELATED_COMPLETE'] = mask * 1 + mask_US * 1

        self.unrelated = unrelated.copy()

        # Is the split of total sales complete?
        total = ['TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_OTHER_COUNTRY']

        mask_0 = ~merged_df[total[0]].isnull()
        mask_1 = ~merged_df[total[1]].isnull()
        mask_2 = ~merged_df[total[2]].isnull()

        mask = np.logical_and(
            mask_0,
            np.logical_and(
                mask_1,
                mask_2
            )
        )

        # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
        merged_df['IS_TOTAL_COMPLETE'] = mask * 1 + mask_US * 1

        self.total = total.copy()

        return merged_df.copy()

    def compute_revenue_percentages(self):
        """
        In order to split the country-by-country revenue variables into three categories (sales to the host country,
        sales to the headquarter country and sales to any other country), we move from absolute amounts in the BEA co-
        lumns to sales percentages. For unrelated-party, related-party and total revenues, we compute the share of these
        revenues that are directed to the three different categories of destinations. We therefore add 9 new columns to
        the DataFrame obtained from the "add_indicator_variables" method.
        """
        merged_df = self.add_indicator_variables()

        bases = ['TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_OTHER_COUNTRY']

        percentage_columns = []

        for sales_type in ['RELATED', 'UNRELATED', 'TOTAL']:
            if sales_type in ['RELATED', 'UNRELATED']:
                existing_columns = [column + '_' + sales_type for column in bases]

                total_column = 'TOTAL_' + sales_type

            else:
                existing_columns = bases.copy()

                total_column = 'TOTAL_COMPUTED'

            merged_df[total_column] = merged_df[existing_columns].sum(axis=1)

            for i, destination in enumerate(['US', 'AFFILIATE_COUNTRY', 'OTHER_COUNTRY']):
                new_column = '_'.join(['PERC', sales_type, destination])

                percentage_columns.append(new_column)

                merged_df[new_column] = merged_df[existing_columns[i]] / merged_df[total_column]

        self.percentage_columns = percentage_columns.copy()

        for column in percentage_columns:
            merged_df[column] = merged_df.apply(
                lambda row: eliminate_irrelevant_percentages(row, column),
                axis=1
            )

        return merged_df.copy()

    def build_imputations_dict(self):
        """
        For partner countries in the IRS data that are absent from BEA data or more generally, for which BEA data are
        incomplete, missing sales percentages are imputed at the continental level. This method allows to construct a
        dictionary that associates each of the 4 continent codes the distribution of unrelated-party, related-party and
        total sales between three types of destinations: host country, headquarter country and any other country.
        """
        merged_df = self.compute_revenue_percentages()

        imputations = {}

        # We iterate over continents
        for continent_code in merged_df['CONTINENT_CODE'].unique():

            # We restrict the dataset to the continent under consideration
            restricted_df = merged_df[merged_df['CONTINENT_CODE'] == continent_code].copy()

            # We build a dictionary with continent codes as keys and dictionaries (to be filled) as values
            imputations[continent_code] = {}

            # We iterate over sales categories
            for sales_type in ['UNRELATED', 'RELATED', 'TOTAL']:

                # We restrict the dataset to jurisdictions of the continent under consideration for which BEA data on
                # the given type of sales are complete
                indicator_column = '_'.join(['IS', sales_type, 'COMPLETE'])
                restricted_df = restricted_df[restricted_df[indicator_column] == 1].copy()

                # We aggregate total sales, sales to the host country, sales to the headquarter country and sales to any
                # other country over the restricted dataset, for a given type of sales
                sums = restricted_df.sum()

                if sales_type in ['UNRELATED', 'RELATED']:
                    suffix = '_' + sales_type
                    total_column = 'TOTAL' + suffix

                else:
                    suffix = ''
                    total_column = 'TOTAL_COMPUTED'

                for destination in ['US', 'AFFILIATE_COUNTRY', 'OTHER_COUNTRY']:
                    column = 'TOTAL_' + destination + suffix

                    # key corresponds to the name of the column in which we want to impute the missing value
                    key = '_'.join(['PERC', sales_type, destination])

                    # We compute the sales percentage for a given continent, type of sales and destination
                    imputations[continent_code][key] = sums.loc[column] / sums.loc[total_column]

        return imputations.copy()

    def impute_missing_percentages(self):
        """
        Building upon the previous method, "build_imputations_dict", this method loads the dataset from the "compute_
        revenue_percentages" and complements with the continental imputations applied to partner countries in the IRS
        data for which we lack some information in the BEA data to distribute revenues.
        """

        # We load the dataset to be complemented
        merged_df = self.compute_revenue_percentages()

        # We construct the continental imputation dictionary
        imputations = self.build_imputations_dict()

        # We impute missing values thanks to the pre-constructed dictionary
        for column in self.percentage_columns:
            merged_df[column] = merged_df.apply(
                lambda row: impute_missing_values(row, column, imputations),
                axis=1
            )

        # We drop absolute amounts from the BEA data
        # e are only interested in sales percentages to distribute the IRS revenue variables
        merged_df.drop(
            columns=self.related + self.unrelated + self.total + ['TOTAL_FOREIGN'],
            inplace=True
        )

        return merged_df.copy()

    def deduce_absolute_amounts(self):
        """
        From the BEA sales percentages that we compute and complement within the "impute_missing_percentages" method, we
        deduce a distribution (in absolute amounts) of the revenue variables of the IRS' country-by-country data, based
        on their estimated ultimate destination (usual three destination types).
        """

        # We load the merged table, extended with imputations
        merged_df = self.impute_missing_percentages()

        absolute_amount_columns = []

        # We iterate over the sales categories
        for column in ['UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES']:

            sales_type = column.split('_')[0]

            # And then over ultimate destination types
            for destination in ['US', 'OTHER_COUNTRY', 'AFFILIATE_COUNTRY']:

                new_column = column + '_TO_' + destination

                # And we construct the new column as (IRS absolute amounts * BEA sales percentages)
                merged_df[new_column] = (
                    merged_df[column] * merged_df['PERC_' + sales_type + '_' + destination]
                )

                absolute_amount_columns.append(new_column)

        self.absolute_amount_columns = absolute_amount_columns.copy()

        return merged_df.copy()

    def get_splitted_revenues(self):
        """
        This method is a simple extension of the previous one, "deduce_absolute_amounts", used to get the split of the
        revenue variables in the IRS' country-by-country data. We simply restrict the DataFrame returned by the "deduce_
        absolute_amounts" methods to our columns of interest.
        """

        merged_df = self.deduce_absolute_amounts()

        merged_df = merged_df[['AFFILIATE_COUNTRY_NAME', 'CODE'] + self.absolute_amount_columns].copy()

        return merged_df.copy()

    def get_sales_percentages(self):
        """
        This method is, similarly to "get_splitted_revenues", an extension of the "deduce_absolute_amounts" method. In-
        stead of focusing on absolute amount columns, we restrict the DataFrame to sales percentages computed based on
        BEA data.
        """

        merged_df = self.deduce_absolute_amounts()

        merged_df = merged_df[['AFFILIATE_COUNTRY_NAME', 'CODE'] + self.percentage_columns].copy()

        return merged_df.copy()

Classes

class RevenueSplitter (year, include_US=True, path_to_dir='/Users/Paul-Emmanuel/Desktop/destination_based_sales/destination_based_sales')

The logic allowing to split revenue variables is encapsulated in a Python class, RevenueSplitter.

This is the instantiation function for this class, which requires the following arguments:

the year to consider;
a boolean, indicating whether or not to include US-US sales in the split;
and the path to the directory where the Python file is located, to retrieve the necessary data.

Expand source code

class RevenueSplitter:

    def __init__(
        self,
        year,
        include_US=True,
        path_to_dir=path_to_dir
    ):
        """
        The logic allowing to split revenue variables is encapsulated in a Python class, RevenueSplitter.

        This is the instantiation function for this class, which requires the following arguments:

        - the year to consider;
        - a boolean, indicating whether or not to include US-US sales in the split;
        - and the path to the directory where the Python file is located, to retrieve the necessary data.
        """
        self.year = year

        self.irs_preprocessor = IRSDataPreprocessor(year=year)
        self.bea_preprocessor = BEADataPreprocessor(year=year)

        self.include_US = include_US

        # We reconstruct the path to the Excel file that contains the BEA data we use for the split of US-US sales
        self.path_to_BEA_KR_tables = os.path.join(
            path_to_dir,
            'data',
            str(year),
            'Part-I-K1-R2.xls'
        )

    def merge_dataframes(self, include_US=True):
        """
        This class method is used to combine the IRS and BEA dataset. If US-US sales are excluded, it consists in a sim-
        ple merge, with a few duplicate columns to filter out. On the other hand, if US-US sales are included, since
        their split is based on a secondary file provided by the BEA, we have to operate the split separately and re-
        introduce it in the merged dataset. Whether to include or not the US-US sales is determined by the "include_US"
        boolean argument.
        """
        irs = self.irs_preprocessor.load_final_data()
        bea = self.bea_preprocessor.load_final_data()

        # We merge the two datasets (IRS and BEA)
        merged_df = irs.merge(
            bea,
            how='left',
            on='CODE'
        )

        merged_df.drop(
            columns=['AFFILIATE_COUNTRY_NAME_y', 'CONTINENT_NAME_y', 'CONTINENT_CODE_y', 'NAME'],
            inplace=True
        )

        merged_df.rename(
            columns={
                'AFFILIATE_COUNTRY_NAME_x': 'AFFILIATE_COUNTRY_NAME',
                'CONTINENT_NAME_x': 'CONTINENT_NAME',
                'CONTINENT_CODE_x': 'CONTINENT_CODE'
            },
            inplace=True
        )

        if include_US:
            # We load the secondary file from the BEA, with the split of US-US sales
            df = pd.read_excel(self.path_to_BEA_KR_tables, sheet_name='Table I.O 1')

            # Cleaning and reorganising the table
            column_names = df.loc[4].to_dict().copy()
            column_names[list(column_names.keys())[0]] = 'Industry'
            column_names['Unnamed: 1'] = 'Total'

            df.rename(columns=column_names, inplace=True)

            # Extracting information on the US-US sales
            us_sales = df.loc[6].to_dict()

            us_imputation = {}

            # Imputing the BEA-like distribution of US-US sales into the merged DataFrame
            # Sales to the affiliate country and to the headquarter country are directed to the same final destination
            for column in merged_df.columns[-11:]:

                if column == 'TOTAL':
                    us_imputation[column] = us_sales['Total']

                elif 'US' in column:
                    us_imputation[column] = us_sales['To U.S. persons'] * 1

                elif 'AFFILIATE_COUNTRY' in column:
                    us_imputation[column] = us_sales['To U.S. persons'] * 0

                else:
                    us_imputation[column] = us_sales['To foreign affiliates'] + us_sales['To other foreign persons']

            for column in merged_df.columns[-11:]:
                merged_df[column] = merged_df.apply(
                    lambda row: us_imputation[column] if row['CODE'] == 'USA' else row[column],
                    axis=1
                )

            return merged_df.copy()

        else:

            merged_df = merged_df[merged_df['CODE'] != 'USA'].copy()

            return merged_df.copy()

    def add_indicator_variables(self):
        """
        Building upon the previous method, "merge_dataframes", this method complements the dataset obtained by merging
        data from the IRS and those from the BEA with indicator variables that indicate, for each partner country,
        whether the information from the BEA is complete and allows to split the country-by-country revenue variables.
        In that sense, this method paves the way for the imputation of missing information (see below).

        NB: for many countries, all indicator variables will take value 0 simply because the partner country of the IRS'
        country-by-country statistics is absent from BEA data.
        """

        # We get the merged DataFrame
        merged_df = self.merge_dataframes(include_US=self.include_US)

        mask_US = (merged_df['CODE'] == 'USA')

        # Is the split of related-party sales complete?
        related = ['TOTAL_US_RELATED', 'TOTAL_AFFILIATE_COUNTRY_RELATED', 'TOTAL_OTHER_COUNTRY_RELATED']

        mask_0 = ~merged_df[related[0]].isnull()
        mask_1 = ~merged_df[related[1]].isnull()
        mask_2 = ~merged_df[related[2]].isnull()

        mask = np.logical_and(
            mask_0,
            np.logical_and(
                mask_1,
                mask_2
            )
        )

        # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
        merged_df['IS_RELATED_COMPLETE'] = mask * 1 + mask_US * 1

        self.related = related.copy()

        # Is the split of unrelated-party sales complete?
        unrelated = ['TOTAL_US_UNRELATED', 'TOTAL_AFFILIATE_COUNTRY_UNRELATED', 'TOTAL_OTHER_COUNTRY_UNRELATED']

        mask_0 = ~merged_df[unrelated[0]].isnull()
        mask_1 = ~merged_df[unrelated[1]].isnull()
        mask_2 = ~merged_df[unrelated[2]].isnull()

        mask = np.logical_and(
            mask_0,
            np.logical_and(
                mask_1,
                mask_2
            )
        )

        # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
        merged_df['IS_UNRELATED_COMPLETE'] = mask * 1 + mask_US * 1

        self.unrelated = unrelated.copy()

        # Is the split of total sales complete?
        total = ['TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_OTHER_COUNTRY']

        mask_0 = ~merged_df[total[0]].isnull()
        mask_1 = ~merged_df[total[1]].isnull()
        mask_2 = ~merged_df[total[2]].isnull()

        mask = np.logical_and(
            mask_0,
            np.logical_and(
                mask_1,
                mask_2
            )
        )

        # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
        merged_df['IS_TOTAL_COMPLETE'] = mask * 1 + mask_US * 1

        self.total = total.copy()

        return merged_df.copy()

    def compute_revenue_percentages(self):
        """
        In order to split the country-by-country revenue variables into three categories (sales to the host country,
        sales to the headquarter country and sales to any other country), we move from absolute amounts in the BEA co-
        lumns to sales percentages. For unrelated-party, related-party and total revenues, we compute the share of these
        revenues that are directed to the three different categories of destinations. We therefore add 9 new columns to
        the DataFrame obtained from the "add_indicator_variables" method.
        """
        merged_df = self.add_indicator_variables()

        bases = ['TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_OTHER_COUNTRY']

        percentage_columns = []

        for sales_type in ['RELATED', 'UNRELATED', 'TOTAL']:
            if sales_type in ['RELATED', 'UNRELATED']:
                existing_columns = [column + '_' + sales_type for column in bases]

                total_column = 'TOTAL_' + sales_type

            else:
                existing_columns = bases.copy()

                total_column = 'TOTAL_COMPUTED'

            merged_df[total_column] = merged_df[existing_columns].sum(axis=1)

            for i, destination in enumerate(['US', 'AFFILIATE_COUNTRY', 'OTHER_COUNTRY']):
                new_column = '_'.join(['PERC', sales_type, destination])

                percentage_columns.append(new_column)

                merged_df[new_column] = merged_df[existing_columns[i]] / merged_df[total_column]

        self.percentage_columns = percentage_columns.copy()

        for column in percentage_columns:
            merged_df[column] = merged_df.apply(
                lambda row: eliminate_irrelevant_percentages(row, column),
                axis=1
            )

        return merged_df.copy()

    def build_imputations_dict(self):
        """
        For partner countries in the IRS data that are absent from BEA data or more generally, for which BEA data are
        incomplete, missing sales percentages are imputed at the continental level. This method allows to construct a
        dictionary that associates each of the 4 continent codes the distribution of unrelated-party, related-party and
        total sales between three types of destinations: host country, headquarter country and any other country.
        """
        merged_df = self.compute_revenue_percentages()

        imputations = {}

        # We iterate over continents
        for continent_code in merged_df['CONTINENT_CODE'].unique():

            # We restrict the dataset to the continent under consideration
            restricted_df = merged_df[merged_df['CONTINENT_CODE'] == continent_code].copy()

            # We build a dictionary with continent codes as keys and dictionaries (to be filled) as values
            imputations[continent_code] = {}

            # We iterate over sales categories
            for sales_type in ['UNRELATED', 'RELATED', 'TOTAL']:

                # We restrict the dataset to jurisdictions of the continent under consideration for which BEA data on
                # the given type of sales are complete
                indicator_column = '_'.join(['IS', sales_type, 'COMPLETE'])
                restricted_df = restricted_df[restricted_df[indicator_column] == 1].copy()

                # We aggregate total sales, sales to the host country, sales to the headquarter country and sales to any
                # other country over the restricted dataset, for a given type of sales
                sums = restricted_df.sum()

                if sales_type in ['UNRELATED', 'RELATED']:
                    suffix = '_' + sales_type
                    total_column = 'TOTAL' + suffix

                else:
                    suffix = ''
                    total_column = 'TOTAL_COMPUTED'

                for destination in ['US', 'AFFILIATE_COUNTRY', 'OTHER_COUNTRY']:
                    column = 'TOTAL_' + destination + suffix

                    # key corresponds to the name of the column in which we want to impute the missing value
                    key = '_'.join(['PERC', sales_type, destination])

                    # We compute the sales percentage for a given continent, type of sales and destination
                    imputations[continent_code][key] = sums.loc[column] / sums.loc[total_column]

        return imputations.copy()

    def impute_missing_percentages(self):
        """
        Building upon the previous method, "build_imputations_dict", this method loads the dataset from the "compute_
        revenue_percentages" and complements with the continental imputations applied to partner countries in the IRS
        data for which we lack some information in the BEA data to distribute revenues.
        """

        # We load the dataset to be complemented
        merged_df = self.compute_revenue_percentages()

        # We construct the continental imputation dictionary
        imputations = self.build_imputations_dict()

        # We impute missing values thanks to the pre-constructed dictionary
        for column in self.percentage_columns:
            merged_df[column] = merged_df.apply(
                lambda row: impute_missing_values(row, column, imputations),
                axis=1
            )

        # We drop absolute amounts from the BEA data
        # e are only interested in sales percentages to distribute the IRS revenue variables
        merged_df.drop(
            columns=self.related + self.unrelated + self.total + ['TOTAL_FOREIGN'],
            inplace=True
        )

        return merged_df.copy()

    def deduce_absolute_amounts(self):
        """
        From the BEA sales percentages that we compute and complement within the "impute_missing_percentages" method, we
        deduce a distribution (in absolute amounts) of the revenue variables of the IRS' country-by-country data, based
        on their estimated ultimate destination (usual three destination types).
        """

        # We load the merged table, extended with imputations
        merged_df = self.impute_missing_percentages()

        absolute_amount_columns = []

        # We iterate over the sales categories
        for column in ['UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES']:

            sales_type = column.split('_')[0]

            # And then over ultimate destination types
            for destination in ['US', 'OTHER_COUNTRY', 'AFFILIATE_COUNTRY']:

                new_column = column + '_TO_' + destination

                # And we construct the new column as (IRS absolute amounts * BEA sales percentages)
                merged_df[new_column] = (
                    merged_df[column] * merged_df['PERC_' + sales_type + '_' + destination]
                )

                absolute_amount_columns.append(new_column)

        self.absolute_amount_columns = absolute_amount_columns.copy()

        return merged_df.copy()

    def get_splitted_revenues(self):
        """
        This method is a simple extension of the previous one, "deduce_absolute_amounts", used to get the split of the
        revenue variables in the IRS' country-by-country data. We simply restrict the DataFrame returned by the "deduce_
        absolute_amounts" methods to our columns of interest.
        """

        merged_df = self.deduce_absolute_amounts()

        merged_df = merged_df[['AFFILIATE_COUNTRY_NAME', 'CODE'] + self.absolute_amount_columns].copy()

        return merged_df.copy()

    def get_sales_percentages(self):
        """
        This method is, similarly to "get_splitted_revenues", an extension of the "deduce_absolute_amounts" method. In-
        stead of focusing on absolute amount columns, we restrict the DataFrame to sales percentages computed based on
        BEA data.
        """

        merged_df = self.deduce_absolute_amounts()

        merged_df = merged_df[['AFFILIATE_COUNTRY_NAME', 'CODE'] + self.percentage_columns].copy()

        return merged_df.copy()

Methods

def add_indicator_variables(self)

Building upon the previous method, "merge_dataframes", this method complements the dataset obtained by merging data from the IRS and those from the BEA with indicator variables that indicate, for each partner country, whether the information from the BEA is complete and allows to split the country-by-country revenue variables. In that sense, this method paves the way for the imputation of missing information (see below).

NB: for many countries, all indicator variables will take value 0 simply because the partner country of the IRS' country-by-country statistics is absent from BEA data.

Expand source code

def add_indicator_variables(self):
    """
    Building upon the previous method, "merge_dataframes", this method complements the dataset obtained by merging
    data from the IRS and those from the BEA with indicator variables that indicate, for each partner country,
    whether the information from the BEA is complete and allows to split the country-by-country revenue variables.
    In that sense, this method paves the way for the imputation of missing information (see below).

    NB: for many countries, all indicator variables will take value 0 simply because the partner country of the IRS'
    country-by-country statistics is absent from BEA data.
    """

    # We get the merged DataFrame
    merged_df = self.merge_dataframes(include_US=self.include_US)

    mask_US = (merged_df['CODE'] == 'USA')

    # Is the split of related-party sales complete?
    related = ['TOTAL_US_RELATED', 'TOTAL_AFFILIATE_COUNTRY_RELATED', 'TOTAL_OTHER_COUNTRY_RELATED']

    mask_0 = ~merged_df[related[0]].isnull()
    mask_1 = ~merged_df[related[1]].isnull()
    mask_2 = ~merged_df[related[2]].isnull()

    mask = np.logical_and(
        mask_0,
        np.logical_and(
            mask_1,
            mask_2
        )
    )

    # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
    merged_df['IS_RELATED_COMPLETE'] = mask * 1 + mask_US * 1

    self.related = related.copy()

    # Is the split of unrelated-party sales complete?
    unrelated = ['TOTAL_US_UNRELATED', 'TOTAL_AFFILIATE_COUNTRY_UNRELATED', 'TOTAL_OTHER_COUNTRY_UNRELATED']

    mask_0 = ~merged_df[unrelated[0]].isnull()
    mask_1 = ~merged_df[unrelated[1]].isnull()
    mask_2 = ~merged_df[unrelated[2]].isnull()

    mask = np.logical_and(
        mask_0,
        np.logical_and(
            mask_1,
            mask_2
        )
    )

    # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
    merged_df['IS_UNRELATED_COMPLETE'] = mask * 1 + mask_US * 1

    self.unrelated = unrelated.copy()

    # Is the split of total sales complete?
    total = ['TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_OTHER_COUNTRY']

    mask_0 = ~merged_df[total[0]].isnull()
    mask_1 = ~merged_df[total[1]].isnull()
    mask_2 = ~merged_df[total[2]].isnull()

    mask = np.logical_and(
        mask_0,
        np.logical_and(
            mask_1,
            mask_2
        )
    )

    # Takes value 0 if the split is incomplete, 1 if the split is complete and 2 in the US-US case
    merged_df['IS_TOTAL_COMPLETE'] = mask * 1 + mask_US * 1

    self.total = total.copy()

    return merged_df.copy()

def build_imputations_dict(self)

For partner countries in the IRS data that are absent from BEA data or more generally, for which BEA data are incomplete, missing sales percentages are imputed at the continental level. This method allows to construct a dictionary that associates each of the 4 continent codes the distribution of unrelated-party, related-party and total sales between three types of destinations: host country, headquarter country and any other country.

Expand source code

def build_imputations_dict(self):
    """
    For partner countries in the IRS data that are absent from BEA data or more generally, for which BEA data are
    incomplete, missing sales percentages are imputed at the continental level. This method allows to construct a
    dictionary that associates each of the 4 continent codes the distribution of unrelated-party, related-party and
    total sales between three types of destinations: host country, headquarter country and any other country.
    """
    merged_df = self.compute_revenue_percentages()

    imputations = {}

    # We iterate over continents
    for continent_code in merged_df['CONTINENT_CODE'].unique():

        # We restrict the dataset to the continent under consideration
        restricted_df = merged_df[merged_df['CONTINENT_CODE'] == continent_code].copy()

        # We build a dictionary with continent codes as keys and dictionaries (to be filled) as values
        imputations[continent_code] = {}

        # We iterate over sales categories
        for sales_type in ['UNRELATED', 'RELATED', 'TOTAL']:

            # We restrict the dataset to jurisdictions of the continent under consideration for which BEA data on
            # the given type of sales are complete
            indicator_column = '_'.join(['IS', sales_type, 'COMPLETE'])
            restricted_df = restricted_df[restricted_df[indicator_column] == 1].copy()

            # We aggregate total sales, sales to the host country, sales to the headquarter country and sales to any
            # other country over the restricted dataset, for a given type of sales
            sums = restricted_df.sum()

            if sales_type in ['UNRELATED', 'RELATED']:
                suffix = '_' + sales_type
                total_column = 'TOTAL' + suffix

            else:
                suffix = ''
                total_column = 'TOTAL_COMPUTED'

            for destination in ['US', 'AFFILIATE_COUNTRY', 'OTHER_COUNTRY']:
                column = 'TOTAL_' + destination + suffix

                # key corresponds to the name of the column in which we want to impute the missing value
                key = '_'.join(['PERC', sales_type, destination])

                # We compute the sales percentage for a given continent, type of sales and destination
                imputations[continent_code][key] = sums.loc[column] / sums.loc[total_column]

    return imputations.copy()

def compute_revenue_percentages(self)

In order to split the country-by-country revenue variables into three categories (sales to the host country, sales to the headquarter country and sales to any other country), we move from absolute amounts in the BEA co- lumns to sales percentages. For unrelated-party, related-party and total revenues, we compute the share of these revenues that are directed to the three different categories of destinations. We therefore add 9 new columns to the DataFrame obtained from the "add_indicator_variables" method.

Expand source code

def compute_revenue_percentages(self):
    """
    In order to split the country-by-country revenue variables into three categories (sales to the host country,
    sales to the headquarter country and sales to any other country), we move from absolute amounts in the BEA co-
    lumns to sales percentages. For unrelated-party, related-party and total revenues, we compute the share of these
    revenues that are directed to the three different categories of destinations. We therefore add 9 new columns to
    the DataFrame obtained from the "add_indicator_variables" method.
    """
    merged_df = self.add_indicator_variables()

    bases = ['TOTAL_US', 'TOTAL_AFFILIATE_COUNTRY', 'TOTAL_OTHER_COUNTRY']

    percentage_columns = []

    for sales_type in ['RELATED', 'UNRELATED', 'TOTAL']:
        if sales_type in ['RELATED', 'UNRELATED']:
            existing_columns = [column + '_' + sales_type for column in bases]

            total_column = 'TOTAL_' + sales_type

        else:
            existing_columns = bases.copy()

            total_column = 'TOTAL_COMPUTED'

        merged_df[total_column] = merged_df[existing_columns].sum(axis=1)

        for i, destination in enumerate(['US', 'AFFILIATE_COUNTRY', 'OTHER_COUNTRY']):
            new_column = '_'.join(['PERC', sales_type, destination])

            percentage_columns.append(new_column)

            merged_df[new_column] = merged_df[existing_columns[i]] / merged_df[total_column]

    self.percentage_columns = percentage_columns.copy()

    for column in percentage_columns:
        merged_df[column] = merged_df.apply(
            lambda row: eliminate_irrelevant_percentages(row, column),
            axis=1
        )

    return merged_df.copy()

def deduce_absolute_amounts(self)

From the BEA sales percentages that we compute and complement within the "impute_missing_percentages" method, we deduce a distribution (in absolute amounts) of the revenue variables of the IRS' country-by-country data, based on their estimated ultimate destination (usual three destination types).

Expand source code

def deduce_absolute_amounts(self):
    """
    From the BEA sales percentages that we compute and complement within the "impute_missing_percentages" method, we
    deduce a distribution (in absolute amounts) of the revenue variables of the IRS' country-by-country data, based
    on their estimated ultimate destination (usual three destination types).
    """

    # We load the merged table, extended with imputations
    merged_df = self.impute_missing_percentages()

    absolute_amount_columns = []

    # We iterate over the sales categories
    for column in ['UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES']:

        sales_type = column.split('_')[0]

        # And then over ultimate destination types
        for destination in ['US', 'OTHER_COUNTRY', 'AFFILIATE_COUNTRY']:

            new_column = column + '_TO_' + destination

            # And we construct the new column as (IRS absolute amounts * BEA sales percentages)
            merged_df[new_column] = (
                merged_df[column] * merged_df['PERC_' + sales_type + '_' + destination]
            )

            absolute_amount_columns.append(new_column)

    self.absolute_amount_columns = absolute_amount_columns.copy()

    return merged_df.copy()

def get_sales_percentages(self)

This method is, similarly to "get_splitted_revenues", an extension of the "deduce_absolute_amounts" method. In- stead of focusing on absolute amount columns, we restrict the DataFrame to sales percentages computed based on BEA data.

Expand source code

def get_sales_percentages(self):
    """
    This method is, similarly to "get_splitted_revenues", an extension of the "deduce_absolute_amounts" method. In-
    stead of focusing on absolute amount columns, we restrict the DataFrame to sales percentages computed based on
    BEA data.
    """

    merged_df = self.deduce_absolute_amounts()

    merged_df = merged_df[['AFFILIATE_COUNTRY_NAME', 'CODE'] + self.percentage_columns].copy()

    return merged_df.copy()

def get_splitted_revenues(self)

This method is a simple extension of the previous one, "deduce_absolute_amounts", used to get the split of the revenue variables in the IRS' country-by-country data. We simply restrict the DataFrame returned by the "deduce_ absolute_amounts" methods to our columns of interest.

Expand source code

def get_splitted_revenues(self):
    """
    This method is a simple extension of the previous one, "deduce_absolute_amounts", used to get the split of the
    revenue variables in the IRS' country-by-country data. We simply restrict the DataFrame returned by the "deduce_
    absolute_amounts" methods to our columns of interest.
    """

    merged_df = self.deduce_absolute_amounts()

    merged_df = merged_df[['AFFILIATE_COUNTRY_NAME', 'CODE'] + self.absolute_amount_columns].copy()

    return merged_df.copy()

def impute_missing_percentages(self)

Building upon the previous method, "build_imputations_dict", this method loads the dataset from the "compute_ revenue_percentages" and complements with the continental imputations applied to partner countries in the IRS data for which we lack some information in the BEA data to distribute revenues.

Expand source code

def impute_missing_percentages(self):
    """
    Building upon the previous method, "build_imputations_dict", this method loads the dataset from the "compute_
    revenue_percentages" and complements with the continental imputations applied to partner countries in the IRS
    data for which we lack some information in the BEA data to distribute revenues.
    """

    # We load the dataset to be complemented
    merged_df = self.compute_revenue_percentages()

    # We construct the continental imputation dictionary
    imputations = self.build_imputations_dict()

    # We impute missing values thanks to the pre-constructed dictionary
    for column in self.percentage_columns:
        merged_df[column] = merged_df.apply(
            lambda row: impute_missing_values(row, column, imputations),
            axis=1
        )

    # We drop absolute amounts from the BEA data
    # e are only interested in sales percentages to distribute the IRS revenue variables
    merged_df.drop(
        columns=self.related + self.unrelated + self.total + ['TOTAL_FOREIGN'],
        inplace=True
    )

    return merged_df.copy()

def merge_dataframes(self, include_US=True)

This class method is used to combine the IRS and BEA dataset. If US-US sales are excluded, it consists in a sim- ple merge, with a few duplicate columns to filter out. On the other hand, if US-US sales are included, since their split is based on a secondary file provided by the BEA, we have to operate the split separately and re- introduce it in the merged dataset. Whether to include or not the US-US sales is determined by the "include_US" boolean argument.

Expand source code

def merge_dataframes(self, include_US=True):
    """
    This class method is used to combine the IRS and BEA dataset. If US-US sales are excluded, it consists in a sim-
    ple merge, with a few duplicate columns to filter out. On the other hand, if US-US sales are included, since
    their split is based on a secondary file provided by the BEA, we have to operate the split separately and re-
    introduce it in the merged dataset. Whether to include or not the US-US sales is determined by the "include_US"
    boolean argument.
    """
    irs = self.irs_preprocessor.load_final_data()
    bea = self.bea_preprocessor.load_final_data()

    # We merge the two datasets (IRS and BEA)
    merged_df = irs.merge(
        bea,
        how='left',
        on='CODE'
    )

    merged_df.drop(
        columns=['AFFILIATE_COUNTRY_NAME_y', 'CONTINENT_NAME_y', 'CONTINENT_CODE_y', 'NAME'],
        inplace=True
    )

    merged_df.rename(
        columns={
            'AFFILIATE_COUNTRY_NAME_x': 'AFFILIATE_COUNTRY_NAME',
            'CONTINENT_NAME_x': 'CONTINENT_NAME',
            'CONTINENT_CODE_x': 'CONTINENT_CODE'
        },
        inplace=True
    )

    if include_US:
        # We load the secondary file from the BEA, with the split of US-US sales
        df = pd.read_excel(self.path_to_BEA_KR_tables, sheet_name='Table I.O 1')

        # Cleaning and reorganising the table
        column_names = df.loc[4].to_dict().copy()
        column_names[list(column_names.keys())[0]] = 'Industry'
        column_names['Unnamed: 1'] = 'Total'

        df.rename(columns=column_names, inplace=True)

        # Extracting information on the US-US sales
        us_sales = df.loc[6].to_dict()

        us_imputation = {}

        # Imputing the BEA-like distribution of US-US sales into the merged DataFrame
        # Sales to the affiliate country and to the headquarter country are directed to the same final destination
        for column in merged_df.columns[-11:]:

            if column == 'TOTAL':
                us_imputation[column] = us_sales['Total']

            elif 'US' in column:
                us_imputation[column] = us_sales['To U.S. persons'] * 1

            elif 'AFFILIATE_COUNTRY' in column:
                us_imputation[column] = us_sales['To U.S. persons'] * 0

            else:
                us_imputation[column] = us_sales['To foreign affiliates'] + us_sales['To other foreign persons']

        for column in merged_df.columns[-11:]:
            merged_df[column] = merged_df.apply(
                lambda row: us_imputation[column] if row['CODE'] == 'USA' else row[column],
                axis=1
            )

        return merged_df.copy()

    else:

        merged_df = merged_df[merged_df['CODE'] != 'USA'].copy()

        return merged_df.copy()