Skip to content

Commit e822eab

Browse files
committed
fix: use absolute path for file download
1 parent ab58dbb commit e822eab

File tree

5 files changed

+37
-126
lines changed

5 files changed

+37
-126
lines changed

.github/workflows/ci.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,28 @@ jobs:
3838
- name: Install dependencies
3939
run: |
4040
pip install -r requirements.txt
41+
42+
- name: Debug directory structure
43+
run: |
44+
pwd
45+
ls -R .
46+
47+
- name: Print loaded CBCs
48+
run: |
49+
python3 - <<'EOF'
50+
from scystream.sdk.config import load_config, get_compute_block
51+
from pathlib import Path
52+
import pprint
53+
54+
block_from_code = get_compute_block()
55+
block_from_yaml = load_config("cbc.yaml")
56+
57+
print("Code block:")
58+
pprint.pprint(block_from_code)
59+
60+
print("YAML block:")
61+
pprint.pprint(block_from_yaml)
62+
EOF
4163
4264
- name: Check cbcs
4365
run: |

cbc.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
author: Paul Kalhorn
22
description: Language preprocessing for .txt or .bib files
3-
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
3+
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
44
entrypoints:
55
preprocess_bib_file:
6-
description: Entrypoint for preprocessing a .bib file
6+
description: Entrypoint for preprocessing a .bib file
77
envs:
8+
BIB_DOWNLOAD_PATH: /tmp/input.bib
89
FILTER_STOPWORDS: true
910
LANGUAGE: en
1011
NGRAM_MAX: 3
@@ -23,7 +24,7 @@ entrypoints:
2324
bib_file_S3_PORT: null
2425
bib_file_S3_SECRET_KEY: null
2526
bib_file_SELECTED_ATTRIBUTE: Abstract
26-
description: The bib file, aswell as one attribute selected for preprocessing
27+
description: The bib file, aswell as one attribute selected for preprocessing
2728
type: file
2829
outputs:
2930
dtm_output:
@@ -36,7 +37,7 @@ entrypoints:
3637
dtm_output_S3_HOST: null
3738
dtm_output_S3_PORT: null
3839
dtm_output_S3_SECRET_KEY: null
39-
description: Numpy representation of document-term matrix as .pkl file
40+
description: Numpy representation of document-term matrix as .pkl file
4041
type: file
4142
vocab_output:
4243
config:
@@ -57,6 +58,7 @@ entrypoints:
5758
LANGUAGE: en
5859
NGRAM_MAX: 3
5960
NGRAM_MIN: 2
61+
TXT_DOWNLOAD_PATH: /tmp/input.txt
6062
UNIGRAM_NORMALIZER: porter
6163
USE_NGRAMS: true
6264
inputs:
@@ -70,7 +72,7 @@ entrypoints:
7072
txt_file_S3_HOST: null
7173
txt_file_S3_PORT: null
7274
txt_file_S3_SECRET_KEY: null
73-
description: A .txt file
75+
description: A .txt file
7476
type: file
7577
outputs:
7678
dtm_output:

input.bib

Lines changed: 0 additions & 112 deletions
This file was deleted.

input.txt

Lines changed: 0 additions & 4 deletions
This file was deleted.

main.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from preprocessing.core import Preprocessor
1515
from preprocessing.loader import TxtLoader, BibLoader
1616

17-
1817
logging.basicConfig(
1918
level=logging.INFO,
2019
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
@@ -54,6 +53,8 @@ class PreprocessTXT(EnvSettings):
5453
NGRAM_MIN: int = 2
5554
NGRAM_MAX: int = 3
5655

56+
TXT_DOWNLOAD_PATH: str = "/tmp/input.txt"
57+
5758
txt_input: TXTFileInput
5859
dtm_output: DTMFileOutput
5960
vocab_output: VocabFileOutput
@@ -67,6 +68,8 @@ class PreprocessBIB(EnvSettings):
6768
NGRAM_MIN: int = 2
6869
NGRAM_MAX: int = 3
6970

71+
BIB_DOWNLOAD_PATH: str = "/tmp/input.bib"
72+
7073
bib_input: BIBFileInput
7174
dtm_output: DTMFileOutput
7275
vocab_output: VocabFileOutput
@@ -113,20 +116,20 @@ def _preprocess_and_store(texts, settings):
113116
@entrypoint(PreprocessTXT)
114117
def preprocess_txt_file(settings):
115118
logger.info("Downloading TXT input from S3...")
116-
S3Operations.download(settings.txt_input, "input.txt")
119+
S3Operations.download(settings.txt_input, settings.TXT_DOWNLOAD_PATH)
117120

118-
texts = TxtLoader.load("./input.txt")
121+
texts = TxtLoader.load(settings.TXT_DOWNLOAD_PATH)
119122

120123
_preprocess_and_store(texts, settings)
121124

122125

123126
@entrypoint(PreprocessBIB)
124127
def preprocess_bib_file(settings):
125128
logger.info("Downloading BIB input from S3...")
126-
S3Operations.download(settings.bib_input, "input.bib")
129+
S3Operations.download(settings.bib_input, settings.BIB_DOWNLOAD_PATH)
127130

128131
texts = BibLoader.load(
129-
"./input.bib",
132+
settings.BIB_DOWNLOAD_PATH,
130133
attribute=settings.bib_input.SELECTED_ATTRIBUTE,
131134
)
132135
_preprocess_and_store(texts, settings)

0 commit comments

Comments
 (0)