Module destination_based_sales.irs
This module is used to load and preprocess the country-by-country data of the Internal Revenue Service (IRS). These provide the three revenue variables that we aim to distribute based on their approximate ultimate destination. Data are loaded from Excel files saved in the "data" folder.
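As a quick orientation, here is a minimal usage sketch (hypothetical, and assuming the yearly IRS Excel files and the "geographies.csv" file are indeed available under the "data" folder as described above):

from destination_based_sales.irs import IRSDataPreprocessor

preprocessor = IRSDataPreprocessor(year=2018)
irs_sales = preprocessor.load_final_data()

# The three revenue variables that are later distributed to their approximate ultimate destination
print(
    irs_sales[
        ['AFFILIATE_COUNTRY_NAME', 'UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES']
    ].head()
)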
Source code
"""
This module is used to load and preprocess the country-by-country data of the Internal Revenue Service (IRS). These
provide the three revenue variables that we aim to distribute based on their approximate ultimate destination. Data
are loaded from Excel files saved in the "data" folder.
"""
########################################################################################################################
# --- Imports
import os
import numpy as np
import pandas as pd
from destination_based_sales.utils import CODES_TO_IMPUTE_IRS, impute_missing_codes, UK_CARIBBEAN_ISLANDS
########################################################################################################################
# --- Miscellaneous
path_to_dir = os.path.dirname(os.path.abspath(__file__))
path_to_irs_data = os.path.join(path_to_dir, 'data', '18it01acbc.xlsx')
path_to_geographies = os.path.join(path_to_dir, 'data', 'geographies.csv')
########################################################################################################################
# --- Content
class IRSDataPreprocessor:
def __init__(
self,
year,
path_to_dir=path_to_dir,
path_to_geo_file=path_to_geographies
):
"""
        The logic used to load and preprocess IRS data is organised in a Python class, IRSDataPreprocessor.
        This is the constructor of that class. It takes several arguments:
- the year to consider (for now, one of 2016, 2017 or 2018);
- the path to the directory where this Python file is located, to retrieve the appropriate data file;
- the path to the "geographies.csv" file, used for instance to complement IRS data with country codes.
"""
self.year = year
# We reconstruct the path to the relevant Excel file, which depends on the year considered
self.path_to_cbcr = os.path.join(
path_to_dir,
'data',
str(year),
f'{year - 2000}it01acbc.xlsx'
)
        self.path_to_geo_file = path_to_geo_file
self.CODES_TO_IMPUTE = CODES_TO_IMPUTE_IRS.copy()
self.UK_CARIBBEAN_ISLANDS = UK_CARIBBEAN_ISLANDS.copy()
def load_data(self):
"""
        This method is used to load and clean the data from the IRS. It relies on the data file paths, saved as
        class attributes when the constructor is called. Preprocessing steps are detailed in comments below.
"""
# We load the data from the appropriate Excel file
irs = pd.read_excel(
self.path_to_cbcr,
engine='openpyxl'
)
# We eliminate irrelevant columns and rename the appropriate ones
irs.drop(
columns=['Unnamed: 1'] + list(irs.columns[5:]),
inplace=True
)
irs.columns = [
'AFFILIATE_COUNTRY_NAME', 'UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES'
]
# We filter out irrelevant rows
# In particular, we eliminate rows corresponding to stateless entities and continental totals
irs = irs.loc[5:].copy()
mask_stateless = irs['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'stateless' in country_name.lower()
)
mask_total = irs['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'total' in country_name.lower()
)
mask = ~np.logical_or(
mask_stateless, mask_total
)
irs = irs[mask].copy()
irs = irs.iloc[:-4].copy()
# We rename fields of the form "Other [+ CONTINENT_NAME]"
irs['AFFILIATE_COUNTRY_NAME'] = irs['AFFILIATE_COUNTRY_NAME'].map(
(
lambda country_name: ('Other ' + country_name.split(',')[0]).replace('&', 'and')
if 'other' in country_name.lower() else country_name
)
)
# We deal with a few specific country names
irs['AFFILIATE_COUNTRY_NAME'] = irs['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'United Kingdom' if 'United Kingdom' in country_name else country_name
)
irs['AFFILIATE_COUNTRY_NAME'] = irs['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'Korea' if country_name.startswith('Korea') else country_name
)
irs['AFFILIATE_COUNTRY_NAME'] = irs['AFFILIATE_COUNTRY_NAME'].map(
lambda country_name: 'Congo' if country_name.endswith('(Brazzaville)') else country_name
)
irs.reset_index(drop=True, inplace=True)
return irs.copy()
def load_data_with_geo_codes(self):
"""
        This method is used to add geographical ISO codes to the raw dataset, loaded with the "load_data" method.
        It relies on the "impute_missing_codes" function defined in utils.py.
"""
irs = self.load_data()
geographies = pd.read_csv(self.path_to_geo_file)
# We merge the DataFrame containing raw IRS data with the one containing the ISO code correspondences
merged_df = irs.merge(
geographies,
how='left',
left_on='AFFILIATE_COUNTRY_NAME', right_on='NAME'
)
# We add missing codes
for column in ['NAME', 'CODE', 'CONTINENT_NAME', 'CONTINENT_CODE']:
merged_df[column] = merged_df.apply(
lambda row: impute_missing_codes(
row=row,
column=column,
codes_to_impute=self.CODES_TO_IMPUTE
),
axis=1
)
merged_df.drop(columns=['NAME'], inplace=True)
return merged_df.copy()
def load_data_with_UKI(self):
"""
        To match the BEA data, which show a "United Kingdom Islands, Caribbean" aggregate for all the related
        jurisdictions, we apply the same aggregation to the IRS data. Basically, we sum unrelated-party, related-party
        and total revenues over all the countries concerned and associate the totals with a new "United Kingdom
        Islands, Caribbean" affiliate country name. We then eliminate the fields included in the aggregation.
"""
irs = self.load_data_with_geo_codes()
extract = irs[irs['CODE'].isin(self.UK_CARIBBEAN_ISLANDS)].copy()
irs = irs[~irs['CODE'].isin(self.UK_CARIBBEAN_ISLANDS)].copy()
irs.reset_index(inplace=True, drop=True)
dict_df = irs.to_dict()
dict_df[irs.columns[0]][len(irs)] = 'United Kingdom Islands, Caribbean'
dict_df[irs.columns[1]][len(irs)] = extract['UNRELATED_PARTY_REVENUES'].sum()
dict_df[irs.columns[2]][len(irs)] = extract['RELATED_PARTY_REVENUES'].sum()
dict_df[irs.columns[3]][len(irs)] = extract['TOTAL_REVENUES'].sum()
dict_df[irs.columns[4]][len(irs)] = 'UKI'
dict_df[irs.columns[5]][len(irs)] = 'America'
dict_df[irs.columns[6]][len(irs)] = 'AMR'
irs = pd.DataFrame.from_dict(dict_df)
return irs.copy()
def load_final_data(self):
"""
        This method is used to load the fully preprocessed IRS data. Relying on the "load_data_with_UKI" method,
        continent names and codes are limited to four pairs, corresponding respectively to Europe, Africa, America
        (North and South) and Asia-Pacific (grouping Asia and Oceania).
"""
irs = self.load_data_with_UKI()
irs['CONTINENT_CODE'] = irs['CONTINENT_CODE'].map(
lambda x: 'AMR' if x in ['SAMR', 'NAMR'] else x
)
irs['CONTINENT_NAME'] = irs['CONTINENT_NAME'].map(
lambda x: 'America' if x in ['South America', 'North America'] else x
)
irs['CONTINENT_CODE'] = irs['CONTINENT_CODE'].map(
lambda x: 'APAC' if x in ['ASIA', 'OCN'] or x is None else x
)
irs['CONTINENT_NAME'] = irs['CONTINENT_NAME'].map(
lambda x: 'Asia-Pacific' if x in ['Asia', 'Oceania'] or x is None else x
)
return irs.copy()
Classes
class IRSDataPreprocessor (year, path_to_dir=path_to_dir, path_to_geo_file=path_to_geographies)
-
The logic used to load and preprocess IRS data is organised in a Python class, IRSDataPreprocessor.
This is the constructor of that class. It takes several arguments:
- the year to consider (for now, one of 2016, 2017 or 2018);
- the path to the directory where this Python file is located, to retrieve the appropriate data file;
- the path to the "geographies.csv" file, used for instance to complement IRS data with country codes.
Methods
def load_data(self)
-
This method is used to load and clean the data from the IRS. It relies on the data file paths, saved as class attributes when the constructor is called. Preprocessing steps are detailed in the comments of the source code.
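A short usage sketch (assuming the relevant Excel file is available); the column names are those assigned by the method:

from destination_based_sales.irs import IRSDataPreprocessor

irs = IRSDataPreprocessor(year=2018).load_data()

print(irs.columns.tolist())
# ['AFFILIATE_COUNTRY_NAME', 'UNRELATED_PARTY_REVENUES', 'RELATED_PARTY_REVENUES', 'TOTAL_REVENUES']

# One row per partner jurisdiction; stateless entities and continental totals have been filtered out
print(len(irs))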
def load_data_with_UKI(self)
-
To match the BEA data, which show a "United Kingdom Islands, Caribbean" aggregate for all the related jurisdictions, we apply the same aggregation to the IRS data. Basically, we sum unrelated-party, related-party and total revenues over all the countries concerned and associate the totals with a new "United Kingdom Islands, Caribbean" affiliate country name. We then eliminate the fields included in the aggregation.
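To make the aggregation logic concrete, here is a self-contained sketch on toy data; it uses a pandas concatenation rather than the dictionary manipulation of the actual method, and the country codes listed are only an illustrative subset of UK_CARIBBEAN_ISLANDS:

import pandas as pd

toy = pd.DataFrame({
    'AFFILIATE_COUNTRY_NAME': ['France', 'Cayman Islands', 'British Virgin Islands'],
    'UNRELATED_PARTY_REVENUES': [100.0, 20.0, 10.0],
    'RELATED_PARTY_REVENUES': [50.0, 5.0, 2.0],
    'TOTAL_REVENUES': [150.0, 25.0, 12.0],
    'CODE': ['FRA', 'CYM', 'VGB'],
    'CONTINENT_NAME': ['Europe', 'America', 'America'],
    'CONTINENT_CODE': ['EUR', 'AMR', 'AMR'],
})

uk_caribbean = ['CYM', 'VGB']  # illustrative subset only

# We isolate the rows to aggregate and sum their three revenue variables
extract = toy[toy['CODE'].isin(uk_caribbean)]
rest = toy[~toy['CODE'].isin(uk_caribbean)]

uki_row = pd.DataFrame([{
    'AFFILIATE_COUNTRY_NAME': 'United Kingdom Islands, Caribbean',
    'UNRELATED_PARTY_REVENUES': extract['UNRELATED_PARTY_REVENUES'].sum(),
    'RELATED_PARTY_REVENUES': extract['RELATED_PARTY_REVENUES'].sum(),
    'TOTAL_REVENUES': extract['TOTAL_REVENUES'].sum(),
    'CODE': 'UKI',
    'CONTINENT_NAME': 'America',
    'CONTINENT_CODE': 'AMR',
}])

# The aggregated row replaces the individual jurisdictions
result = pd.concat([rest, uki_row], ignore_index=True)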
def load_data_with_geo_codes(self)
-
This method is used to add geographical ISO codes to the raw dataset, loaded with the "load_data" method. It relies on the "impute_missing_codes" function defined in utils.py.
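A brief usage sketch (assuming the data files are available); rows that could neither be matched against "geographies.csv" nor imputed, if any, would show up with missing codes:

from destination_based_sales.irs import IRSDataPreprocessor

preprocessor = IRSDataPreprocessor(year=2018)
irs_with_codes = preprocessor.load_data_with_geo_codes()

# Any remaining gaps in the ISO codes after the imputation step
print(irs_with_codes[irs_with_codes['CODE'].isnull()])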
def load_final_data(self)
-
This method is used to load the fully preprocessed IRS data. Relying on the "load_data_with_UKI" method, continent names and codes are limited to four pairs, corresponding respectively to Europe, Africa, America (North and South) and Asia-Pacific (grouping Asia and Oceania).
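A short usage sketch (assuming the data files are available), checking that only the four continent pairs remain; the Europe and Africa codes are those of "geographies.csv", while 'AMR' and 'APAC' are set by the method:

from destination_based_sales.irs import IRSDataPreprocessor

irs_final = IRSDataPreprocessor(year=2018).load_final_data()

# Expected: Europe, Africa, America ('AMR') and Asia-Pacific ('APAC')
print(irs_final[['CONTINENT_NAME', 'CONTINENT_CODE']].drop_duplicates())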