
Commit ab58dbb

chore: add logging

1 parent 182e71c

3 files changed: +46 −4 lines changed

main.py

Lines changed: 23 additions & 1 deletion

```diff
@@ -1,5 +1,6 @@
 import pickle
 import tempfile
+import logging
 
 from scystream.sdk.core import entrypoint
 from scystream.sdk.env.settings import (
@@ -14,6 +15,13 @@
 from preprocessing.loader import TxtLoader, BibLoader
 
 
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
 class DTMFileOutput(FileSettings, OutputSettings):
     __identifier__ = "dtm_output"
 
@@ -66,6 +74,8 @@ class PreprocessBIB(EnvSettings):
 
 def _preprocess_and_store(texts, settings):
     """Shared preprocessing logic for TXT and BIB."""
+    logger.info(f"Starting preprocessing with {len(texts)} documents")
+
     pre = Preprocessor(
         language=settings.LANGUAGE,
         filter_stopwords=settings.FILTER_STOPWORDS,
@@ -74,10 +84,12 @@ def _preprocess_and_store(texts, settings):
         ngram_min=settings.NGRAM_MIN,
         ngram_max=settings.NGRAM_MAX,
     )
-    pre.texts = texts
 
+    pre.texts = texts
     pre.analyze_texts()
+
     pre.generate_bag_of_words()
+
     dtm, vocab = pre.generate_document_term_matrix()
 
     with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
@@ -89,20 +101,30 @@ def _preprocess_and_store(texts, settings):
         pickle.dump(vocab, tmp_vocab)
         tmp_vocab.flush()
 
+        logger.info("Uploading DTM to S3...")
         S3Operations.upload(settings.dtm_output, tmp_dtm.name)
+
+        logger.info("Uploading vocabulary to S3...")
         S3Operations.upload(settings.vocab_output, tmp_vocab.name)
 
+    logger.info("Preprocessing completed successfully.")
+
 
 @entrypoint(PreprocessTXT)
 def preprocess_txt_file(settings):
+    logger.info("Downloading TXT input from S3...")
     S3Operations.download(settings.txt_input, "input.txt")
+
     texts = TxtLoader.load("./input.txt")
+
     _preprocess_and_store(texts, settings)
 
 
 @entrypoint(PreprocessBIB)
 def preprocess_bib_file(settings):
+    logger.info("Downloading BIB input from S3...")
     S3Operations.download(settings.bib_input, "input.bib")
+
     texts = BibLoader.load(
         "./input.bib",
         attribute=settings.bib_input.SELECTED_ATTRIBUTE,
```
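
Taken together, main.py is the only place that configures logging; the other two files just create module-level loggers. The sketch below is not part of the commit — it is a standalone illustration, with the module logger names spelled out by hand instead of coming from `__name__` — of why records from `preprocessing.core` and `preprocessing.loader` pick up the format set by `basicConfig()` in main.py: child loggers have no handlers of their own and propagate to the root logger.

```python
import logging

# What main.py does once at import time: attach a StreamHandler with this
# format to the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# What preprocessing/core.py and preprocessing/loader.py do: create loggers
# named after their module. They have no handlers of their own and simply
# propagate records up to the root handler configured above.
core_logger = logging.getLogger("preprocessing.core")      # stand-in name
loader_logger = logging.getLogger("preprocessing.loader")  # stand-in name

core_logger.info("Analyzing 3 texts...")
loader_logger.info("Loading TXT file...")
# Both lines come out in the shared timestamp/name/level format, e.g.:
# 2024-01-01 12:00:00,000 - preprocessing.core - INFO - Analyzing 3 texts...
```

Worth noting: `basicConfig()` is a no-op once the root logger already has handlers, so putting the single call in the entrypoint module, as this commit does, is the usual place for it.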

preprocessing/core.py

Lines changed: 18 additions & 3 deletions

```diff
@@ -1,3 +1,4 @@
+import logging
 import spacy
 import numpy as np
 
@@ -9,6 +10,7 @@
     "en": "en_core_web_sm",
     "de": "de_core_news_sm"
 }
+logger = logging.getLogger(__name__)
 
 
 class Preprocessor:
@@ -21,6 +23,12 @@ def __init__(
         ngram_min: int = 2,
         ngram_max: int = 3,
     ):
+        logger.info(
+            "Init Preprocessor (lang=%s, filter_stopwords=%s, ngrams=%s)",
+            language,
+            filter_stopwords,
+            use_ngrams,
+        )
         self.language = language
         self.filter_stopwords = filter_stopwords
         self.unigram_normalizer = unigram_normalizer
@@ -58,6 +66,7 @@ def filter_tokens(
         ]
 
     def analyze_texts(self):
+        logger.info(f"Analyzing {len(self.texts)} texts...")
         porter = PorterStemmer()
         for text in self.texts:
             doc = self.nlp(text)
@@ -67,8 +76,8 @@ def analyze_texts(self):
 
             for sentence in doc.sents:
                 filtered_tokens = self.filter_tokens(
-                        list(sentence),
-                        self.filter_stopwords
+                    list(sentence),
+                    self.filter_stopwords
                 )
                 normalized_tokens = [
                     self.normalize_token(t, porter) for t in filtered_tokens
@@ -93,6 +102,10 @@ def analyze_texts(self):
             if ngram_list:
                 self.ngram_frequency.update(ngram_list)
                 self.ngram_document_frequency.update(set(ngram_list))
+        logger.info(
+            f"Finished analyzing texts: {self.token_frequency} unigrams, {
+                self.ngram_frequency} n-grams",
+        )
 
     def normalize_token(
         self,
@@ -110,6 +123,7 @@ def normalize_token(
         return word
 
     def generate_bag_of_words(self):
+        logger.info("Generating bag-of-words...")
         porter = PorterStemmer()
         self.bag_of_words = []
 
@@ -177,7 +191,7 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict):
             dtm (np.ndarray): shape = (num_docs, num_terms)
             vocab (dict): mapping term -> column index
         """
-
+        logger.info("Building document-term-matrix...")
         all_terms = set()
         for doc in self.bag_of_words:
             for t in doc:
@@ -194,4 +208,5 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict):
                 term_idx = vocab[token["term"]]
                 dtm[doc_idx, term_idx] += 1
 
+        logger.info(f"Matrix shape: {dtm.shape} | Vocab size: {len(vocab)}")
         return dtm, vocab
```
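
The new `__init__` log uses lazy %-style arguments, whereas the other added calls build f-strings eagerly. The difference only shows up when a record is filtered: with the lazy form, the arguments are not formatted unless a handler actually emits the record. A small illustrative sketch, separate from the diff — the logger name and the `Expensive` stand-in for a large frequency counter are made up for the example:

```python
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("preprocessing.core")  # stand-in name
logger.setLevel(logging.WARNING)                  # pretend INFO is filtered out


class Expensive:
    """Stand-in for a large Counter whose string form is costly to build."""
    def __str__(self):
        print("  (formatting the counter now)")
        return "Counter({'model': 42, ...})"


counts = Expensive()

# Lazy form: the argument is only str()-ed if a handler emits the record,
# so nothing is printed here.
logger.info("Finished analyzing texts: %s unigrams", counts)

# Eager form: the f-string is built before logging even checks the level,
# so the "(formatting the counter now)" side effect happens anyway.
logger.info(f"Finished analyzing texts: {counts} unigrams")
```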

preprocessing/loader.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -1,6 +1,9 @@
+import logging
 import re
 import bibtexparser
 
+logger = logging.getLogger(__name__)
+
 
 def normalize_text(text: str) -> str:
     if not text:
@@ -24,6 +27,7 @@ def normalize_text(text: str) -> str:
 class TxtLoader:
     @staticmethod
     def load(file_path: str) -> list[str]:
+        logger.info("Loading TXT file...")
         with open(file_path, "r", encoding="utf-8") as f:
             lines = f.readlines()
         return [normalize_text(line) for line in lines]
@@ -32,6 +36,7 @@ def load(file_path: str) -> list[str]:
 class BibLoader:
     @staticmethod
     def load(file_path: str, attribute: str) -> list[str]:
+        logger.info(f"Loading BIB file (attribute={attribute})...")
         with open(file_path, "r", encoding="utf-8") as f:
             bib_database = bibtexparser.load(f)
 
```
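
preprocessing/loader.py only creates a logger and never configures logging itself, so its INFO messages are visible only because main.py sets up the root logger before the loaders run. A minimal standalone sketch of that stdlib behaviour (the logger name is written out by hand rather than taken from `__name__`):

```python
import logging

logger = logging.getLogger("preprocessing.loader")  # stand-in name

logger.info("Loading TXT file...")    # dropped: no handler, default level WARNING
logger.warning("input.txt is empty")  # still reaches stderr via logging.lastResort

logging.basicConfig(level=logging.INFO)  # roughly what main.py does at import time
logger.info("Loading TXT file...")       # now emitted through the root handler
```

Keeping configuration out of library modules and doing it once in the entrypoint is the usual convention, which is what this commit follows.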
