clean_text(text)

Clean text (lowercase, punctuation removal, blank space removal).

newsclassifier\data.py
def clean_text(text: str) -> str:
    """Clean text (lowercase, punctuation removal, blank space removal)."""
    logger.info("Cleaning input text.")
    text = text.lower()  # lowercase first, since the stopword list is lowercase

    # remove stopwords
    stp_pattern = re.compile(r"\b(" + r"|".join(Cfg.STOPWORDS) + r")\b\s*")
    text = stp_pattern.sub("", text)

    # custom cleaning
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # replace runs of non-alphanumeric characters with a single space
    text = re.sub(" +", " ", text)  # collapse repeated spaces
    text = text.strip()  # remove leading/trailing whitespace, including any left by the substitution above

    return text
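
A minimal usage sketch (the module path follows from the file location above; the exact output depends on the contents of Cfg.STOPWORDS, so the result shown is illustrative):

from newsclassifier.data import clean_text

print(clean_text("The markets rallied today, despite earlier losses!"))
# e.g. "markets rallied today despite earlier losses"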

collate(inputs)

Collate and modify the input dictionary to have the same sequence length for a particular input batch.

Parameters:
  • inputs (dict) –

    A dictionary containing input tensors with varying sequence lengths.

Returns:
  • modified_inputs (dict) –

    A modified dictionary with input tensors trimmed to have the same sequence length.

newsclassifier\data.py
def collate(inputs: Dict) -> Dict:
    """Collate and modify the input dictionary to have the same sequence length for a particular input batch.

    Args:
        inputs (dict): A dictionary containing input tensors with varying sequence lengths.

    Returns:
        modified_inputs (dict): A modified dictionary with input tensors trimmed to have the same sequence length.
    """
    # the attention mask marks real tokens with 1 and padding with 0, so its row sums give sequence lengths
    max_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = v[:, :max_len]
    return inputs
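
An illustrative sketch of the trimming behaviour, using hypothetical tensors for a batch padded to length 6 whose longest real sequence is 4:

import torch

from newsclassifier.data import collate

batch = {
    "input_ids": torch.tensor([[5, 8, 3, 2, 0, 0], [5, 9, 0, 0, 0, 0]]),
    "attention_mask": torch.tensor([[1, 1, 1, 1, 0, 0], [1, 1, 0, 0, 0, 0]]),
}
batch = collate(batch)
print(batch["input_ids"].shape)  # torch.Size([2, 4])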

data_split(df, split_size=0.2, stratify_on_target=True, save_dfs=False)

Split data into train and test sets.

Parameters:
  • df (DataFrame) –

    Data to be split.

  • split_size (float, default: 0.2) –

    Train-test split ratio (the test fraction).

  • stratify_on_target (bool, default: True) –

    Whether to stratify the split on the target column.

  • save_dfs (bool, default: False) –

    Whether to save the dataset splits as artifacts.

Returns:
  • Train and test DataFrame splits.

newsclassifier\data.py
def data_split(df: pd.DataFrame, split_size: float = 0.2, stratify_on_target: bool = True, save_dfs: bool = False):
    """Split data into train and test sets.

    Args:
        df (pd.DataFrame): Data to be split.
        split_size (float): train-test split ratio (test fraction).
        stratify_on_target (bool): Whether to stratify the split on the target column.
        save_dfs (bool): Whether to save dataset splits in artifacts.

    Returns:
        train_ds, test_ds: train and test DataFrame splits.
    """
    logger.info("Splitting Data.")
    try:
        if stratify_on_target:
            stra = df["Category"]
        else:
            stra = None

        train, test = train_test_split(df, test_size=split_size, random_state=42, stratify=stra)
        train_ds = pd.DataFrame(train, columns=df.columns)
        test_ds = pd.DataFrame(test, columns=df.columns)

        if save_dfs:
            logger.info("Saving and storing data splits.")

            os.makedirs(Cfg.preprocessed_data_path, exist_ok=True)
            train.to_csv(os.path.join(Cfg.preprocessed_data_path, "train.csv"))
            test.to_csv(os.path.join(Cfg.preprocessed_data_path, "test.csv"))
    except Exception as e:
        logger.error(e)
        raise  # re-raise so the function never returns undefined names

    return train_ds, test_ds
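
A minimal sketch with a hypothetical toy DataFrame (the function stratifies on the "Category" column):

import pandas as pd

from newsclassifier.data import data_split

df = pd.DataFrame({
    "Text": list("abcdefghij"),
    "Category": ["tech", "sport"] * 5,
})
train_ds, test_ds = data_split(df, split_size=0.2)
print(len(train_ds), len(test_ds))  # 8 2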

load_dataset(filepath, print_i=0)

Load data from source into a Pandas DataFrame.

Parameters:
  • filepath (str) –

    File location.

  • print_i (int, default: 0) –

    Number of rows to preview (0 disables the preview).

Returns:
  • DataFrame –

    Pandas DataFrame of the data.

newsclassifier\data.py
def load_dataset(filepath: str, print_i: int = 0) -> pd.DataFrame:
    """load data from source into a Pandas DataFrame.

    Args:
        filepath (str): file location.
        print_i (int): Print number of instances.

    Returns:
        pd.DataFrame: Pandas DataFrame of the data.
    """
    logger.info("Loading Data.")
    df = pd.read_csv(filepath)
    if print_i:
        print(df.head(print_i), "\n")
    return df
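
Usage sketch ("data/news.csv" is a placeholder path, not a file shipped with the project):

from newsclassifier.data import load_dataset

df = load_dataset("data/news.csv", print_i=5)  # prints the first 5 rows, then returns the full DataFrame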

prepare_data(df)

Separate out "Headlines" instances and select the relevant features.

Parameters:
  • df (DataFrame) –

    original dataframe.

Returns:
  • df (DataFrame) –

    new dataframe with appropriate features.

  • headlines_df (DataFrame) –

    dataframe containing "Headlines" category instances.

newsclassifier\data.py
def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Separate out "Headlines" instances and select the relevant features.

    Args:
        df: original dataframe.

    Returns:
        df: new dataframe with appropriate features.
        headlines_df: dataframe containing "Headlines" category instances.
    """
    logger.info("Preparing Data.")
    try:
        # select features and rename without inplace, which avoids SettingWithCopyWarning on a column slice
        df = df[["Title", "Category"]].rename(columns={"Title": "Text"})
        headlines_df = df[df["Category"] == "Headlines"].reset_index(drop=True)
        df = df[df["Category"] != "Headlines"].reset_index(drop=True)
    except Exception as e:
        logger.error(e)
        raise  # re-raise so the function never returns undefined names

    return df, headlines_df
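
An illustrative sketch with a hypothetical two-row DataFrame:

import pandas as pd

from newsclassifier.data import prepare_data

raw = pd.DataFrame({
    "Title": ["Stocks climb", "Morning briefing"],
    "Category": ["Business", "Headlines"],
})
df, headlines_df = prepare_data(raw)
print(df["Text"].tolist())            # ['Stocks climb']
print(headlines_df["Text"].tolist())  # ['Morning briefing']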

prepare_input(tokenizer, text)

Tokenize and prepare the input text using the provided tokenizer.

Parameters:
  • tokenizer (RobertaTokenizer) –

    The Roberta tokenizer to encode the input.

  • text (str) –

    The input text to be tokenized.

Returns:
  • inputs (dict) –

    A dictionary containing the tokenized input with keys such as 'input_ids', 'attention_mask', etc.

newsclassifier\data.py
def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
    """Tokenize and prepare the input text using the provided tokenizer.

    Args:
        tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.
        text (str): The input text to be tokenized.

    Returns:
        inputs (dict): A dictionary containing the tokenized input with keys such as 'input_ids',
            'attention_mask', etc.
    """
    logger.info("Tokenizing input text.")
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=Cfg.add_special_tokens,
        max_length=Cfg.max_len,
        pad_to_max_length=Cfg.pad_to_max_length,  # deprecated in recent transformers releases; padding="max_length" is the modern equivalent
        truncation=Cfg.truncation,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
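
Usage sketch (assumes the transformers library is installed; "roberta-base" stands in for whichever checkpoint Cfg is configured for):

from transformers import RobertaTokenizer

from newsclassifier.data import prepare_input

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
inputs = prepare_input(tokenizer, "markets rallied today")
print(inputs["input_ids"].shape)  # torch.Size([Cfg.max_len]) when padding to max length is enabled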

preprocess(df)

Preprocess the data.

Parameters:
  • df (DataFrame) –

    Dataframe on which the preprocessing steps need to be performed.

Returns:
  • df (DataFrame) –

    Preprocessed data.

  • headlines_df (DataFrame) –

    dataframe containing "Headlines" category instances.

  • class_to_index (Dict) –

    class label to index mapping.

  • index_to_class (Dict) –

    index to class label mapping.

newsclassifier\data.py
def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict]:
    """Preprocess the data.

    Args:
        df: Dataframe on which the preprocessing steps need to be performed.

    Returns:
        df: Preprocessed data.
        headlines_df: dataframe containing "Headlines" category instances.
        class_to_index: class label to index mapping.
        index_to_class: index to class label mapping.
    """
    df, headlines_df = prepare_data(df)

    cats = df["Category"].unique().tolist()
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {v: k for k, v in class_to_index.items()}

    df["Text"] = df["Text"].apply(clean_text)  # clean text
    df = df[["Text", "Category"]]
    try:
        df["Category"] = df["Category"].map(class_to_index)  # label encoding
    except Exception as e:
        logger.error(e)
    return df, headlines_df, class_to_index, index_to_class
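
An end-to-end sketch tying the helpers together ("data/news.csv" is again a placeholder path, and the category names shown are illustrative):

from newsclassifier.data import load_dataset, preprocess

df = load_dataset("data/news.csv")
df, headlines_df, class_to_index, index_to_class = preprocess(df)
print(class_to_index)  # e.g. {'Business': 0, 'Sports': 1, ...}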