11import pickle
22import tempfile
3+ import logging
34
45from scystream .sdk .core import entrypoint
56from scystream .sdk .env .settings import (
1415from preprocessing .loader import TxtLoader , BibLoader
1516
1617
# Configure logging via basicConfig at import time: INFO threshold, with each
# record rendered as "timestamp - logger name - level - message".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger shared by the helpers and entrypoints in this file.
logger = logging.getLogger(__name__)
23+
24+
1725class DTMFileOutput (FileSettings , OutputSettings ):
1826 __identifier__ = "dtm_output"
1927
@@ -66,6 +74,8 @@ class PreprocessBIB(EnvSettings):
6674
6775def _preprocess_and_store (texts , settings ):
6876 """Shared preprocessing logic for TXT and BIB."""
77+ logger .info (f"Starting preprocessing with { len (texts )} documents" )
78+
6979 pre = Preprocessor (
7080 language = settings .LANGUAGE ,
7181 filter_stopwords = settings .FILTER_STOPWORDS ,
@@ -74,10 +84,12 @@ def _preprocess_and_store(texts, settings):
7484 ngram_min = settings .NGRAM_MIN ,
7585 ngram_max = settings .NGRAM_MAX ,
7686 )
77- pre .texts = texts
7887
88+ pre .texts = texts
7989 pre .analyze_texts ()
90+
8091 pre .generate_bag_of_words ()
92+
8193 dtm , vocab = pre .generate_document_term_matrix ()
8294
8395 with tempfile .NamedTemporaryFile (suffix = "_dtm.pkl" ) as tmp_dtm , \
@@ -89,20 +101,30 @@ def _preprocess_and_store(texts, settings):
89101 pickle .dump (vocab , tmp_vocab )
90102 tmp_vocab .flush ()
91103
104+ logger .info ("Uploading DTM to S3..." )
92105 S3Operations .upload (settings .dtm_output , tmp_dtm .name )
106+
107+ logger .info ("Uploading vocabulary to S3..." )
93108 S3Operations .upload (settings .vocab_output , tmp_vocab .name )
94109
110+ logger .info ("Preprocessing completed successfully." )
111+
95112
@entrypoint(PreprocessTXT)
def preprocess_txt_file(settings):
    """Download the TXT input from S3, load it, and run the shared preprocessing.

    Args:
        settings: Settings object for this entrypoint; ``settings.txt_input``
            is passed to ``S3Operations.download`` and the whole object is
            forwarded to ``_preprocess_and_store``.
    """
    logger.info("Downloading TXT input from S3...")
    S3Operations.download(settings.txt_input, "input.txt")

    # NOTE(review): presumably "input.txt" (download target) and "./input.txt"
    # (load path) refer to the same file in the working directory -- confirm
    # against S3Operations.download.
    _preprocess_and_store(TxtLoader.load("./input.txt"), settings)
101121
102122
103123@entrypoint (PreprocessBIB )
104124def preprocess_bib_file (settings ):
125+ logger .info ("Downloading BIB input from S3..." )
105126 S3Operations .download (settings .bib_input , "input.bib" )
127+
106128 texts = BibLoader .load (
107129 "./input.bib" ,
108130 attribute = settings .bib_input .SELECTED_ATTRIBUTE ,
0 commit comments