diff --git a/amazon/ion/__init__.py b/amazon/ion/__init__.py
index 16c949857..d9d99497a 100644
--- a/amazon/ion/__init__.py
+++ b/amazon/ion/__init__.py
@@ -36,4 +36,5 @@
     'writer_binary_raw_fields',
     'writer_buffer',
     'writer_text',
+    'lazy_type',
 ]
diff --git a/amazon/ion/ioncmodule.c b/amazon/ion/ioncmodule.c
index 603ba1134..99bb58dd4 100644
--- a/amazon/ion/ioncmodule.c
+++ b/amazon/ion/ioncmodule.c
@@ -30,7 +30,7 @@ static char _err_msg[ERR_MSG_MAX_LEN];
 // Python 2/3 compatibility
 #if PY_MAJOR_VERSION >= 3
     #define IONC_BYTES_FORMAT "y#"
-    #define IONC_READ_ARGS_FORMAT "OO"
+    #define IONC_READ_ARGS_FORMAT "OOO"
     #define PyInt_AsSsize_t PyLong_AsSsize_t
     #define PyInt_AsLong PyLong_AsLong
     #define PyInt_FromLong PyLong_FromLong
@@ -41,7 +41,7 @@ static char _err_msg[ERR_MSG_MAX_LEN];
     #define PyInt_Check PyLong_Check
 #else
     #define IONC_BYTES_FORMAT "s#"
-    #define IONC_READ_ARGS_FORMAT "OOO"
+    #define IONC_READ_ARGS_FORMAT "OOOO"
 #endif
 
 #if PY_VERSION_HEX < 0x02070000
@@ -53,6 +53,8 @@ static PyObject* _math_module;
 static PyObject* _decimal_module;
 static PyObject* _decimal_constructor;
 static PyObject* _py_timestamp_constructor;
+static PyObject* _lazytype_module;
+static PyObject* _ionpylazyobj_cls;
 static PyObject* _simpletypes_module;
 static PyObject* _ionpynull_cls;
 static PyObject* _ionpynull_fromvalue;
@@ -792,6 +794,37 @@ iERR ionc_write_value(hWRITER writer, PyObject* obj, PyObject* tuple_as_sexp) {
         IONCHECK(ionc_write_sequence(writer, obj, tuple_as_sexp));
         IONCHECK(ion_writer_finish_container(writer));
     }
+    /*
+     * A lazy Ion object keeps the binary encoding it was read from in its `lazy_buffer`
+     * attribute. Ideally those cached bytes would be appended as-is to the writer's value
+     * stream, but no ion_writer.h API was found for appending pre-encoded bytes, so they
+     * are written as an Ion blob instead. That output is NOT the original value: it only
+     * makes the cached bytes visible for debugging until ion-c exposes a raw-append API
+     * to the CPython layer.
+     */
+    else if (PyObject_IsInstance(obj, _ionpylazyobj_cls)) {
+        Py_ssize_t len;
+        char* c_buf;
+        iERR write_err;
+
+        // New reference, or NULL when the object has no such attribute.
+        PyObject* lazy_buffer = PyObject_GetAttrString(obj, "lazy_buffer");
+        if (lazy_buffer == NULL) {
+            _FAILWITHMSG(IERR_INVALID_ARG, "Lazy object has no lazy_buffer.");
+        }
+
+        // Convert python bytes to C bytes
+        if (PyBytes_AsStringAndSize(lazy_buffer, &c_buf, &len) < 0) {
+            Py_DECREF(lazy_buffer);
+            _FAILWITHMSG(IERR_INVALID_ARG, "Binary conversion error.");
+        }
+        // TODO should append the c_buf to _value_stream directly. Not an Ion BLOB!
+        // E.g. Something like ION_PUT(writer->_typed_writer.binary._value_stream, c_buf);
+        write_err = ion_writer_write_blob(writer, (BYTE*)c_buf, len);
+        // c_buf points into lazy_buffer, so release it only after the write.
+        Py_DECREF(lazy_buffer);
+        IONCHECK(write_err);
+    }
     else {
         _FAILWITHMSG(IERR_INVALID_STATE, "Cannot dump arbitrary object types.");
     }
@@ -1503,6 +1536,64 @@ void ionc_read_iter_dealloc(PyObject *self) {
     PyObject_Del(self);
 }
 
+/*
+ * Appends one IonPyLazyObj to `container` for every value ion_reader_next() yields from
+ * `hreader` until EOF. Nothing is decoded: each lazy object receives a copy of the value's
+ * bytes, taken from `buffer` at the offset and length the reader reports, plus the value's
+ * Ion type.
+ *
+ * `buffer` is expected to be the memory `hreader` was opened on. `emit_bare_values` is
+ * currently unused.
+ */
+iERR ionc_lazy_read_all(hREADER hreader, PyObject* container, BOOL in_struct, BOOL emit_bare_values, char* buffer) {
+    iENTER;
+    ION_TYPE t;
+    for (;;) {
+        PyObject* lazy_obj;
+        PyObject* py_cached_bytes;
+        // Start position of the current value
+        POSITION p_offset = 0;
+        // Length of the current value
+        SIZE p_length = 0;
+
+        IONCHECK(ion_reader_next(hreader, &t));
+        if (t == tid_EOF) {
+            break;
+        }
+
+        // Find the bytes of the value the reader is positioned on so they can be cached.
+        // Both calls return an error code, which must not be ignored.
+        IONCHECK(ion_reader_get_value_offset(hreader, &p_offset));
+        IONCHECK(ion_reader_get_value_length(hreader, &p_length));
+
+        // TODO is it possible to return a memory view pointing to the specific position of the original buffer?
+        // Python MemoryView? PyMemoryView_FromMemory(buffer+p_offset, p_length, PyBUF_WRITE),
+        // PyBytes_FromStringAndSize copies the bytes; unlike Py_BuildValue("y#", ...) it also exists on Python 2.
+        py_cached_bytes = PyBytes_FromStringAndSize(buffer + p_offset, (Py_ssize_t)p_length);
+        if (py_cached_bytes == NULL) {
+            FAILWITH(IERR_INTERNAL_ERROR);
+        }
+
+        // Equivalent to the Python call IonPyLazyObj(py_cached_bytes, <ion type of t>).
+        lazy_obj = PyObject_CallFunctionObjArgs(
+            _ionpylazyobj_cls,
+            py_cached_bytes,
+            py_ion_type_table[ION_TYPE_INT(t) >> 8],
+            NULL
+        );
+        // The lazy object (if it was created) now holds its own reference to the bytes.
+        Py_DECREF(py_cached_bytes);
+        if (lazy_obj == NULL) {
+            FAILWITH(IERR_INTERNAL_ERROR);
+        }
+
+        // NOTE(review): lazy_obj is not released here, as before -- confirm that
+        // ionc_add_to_container takes over the reference.
+        ionc_add_to_container(container, lazy_obj, in_struct, NULL);
+    }
+    iRETURN;
+}
+
 /*
  * Entry point of read/load functions
  */
@@ -1510,36 +1601,72 @@ PyObject* ionc_read(PyObject* self, PyObject *args, PyObject *kwds) {
     iENTER;
     PyObject *py_file = NULL; // TextIOWrapper
     PyObject *emit_bare_values;
+    PyObject *parse_lazily;
     ionc_read_Iterator *iterator = NULL;
-    static char *kwlist[] = {"file", "emit_bare_values", NULL};
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, IONC_READ_ARGS_FORMAT, kwlist, &py_file, &emit_bare_values)) {
+    static char *kwlist[] = {"file", "emit_bare_values", "parse_lazily", NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, IONC_READ_ARGS_FORMAT, kwlist, &py_file, &emit_bare_values, &parse_lazily)) {
         FAILWITH(IERR_INVALID_ARG);
     }
 
-    iterator = PyObject_New(ionc_read_Iterator, &ionc_read_IteratorType);
-    if (!iterator) {
-        FAILWITH(IERR_INTERNAL_ERROR);
-    }
-    Py_INCREF(py_file);
+    // Store the stream in IonPyObj until it actually needs to be serialized.
+    if (parse_lazily == Py_True) {
+        hREADER reader;
+        char *buffer = NULL;
+        Py_ssize_t size;
+        iERR read_err;
+        iERR close_err;
+        PyObject *top_level_container = NULL;
+        // In this mode `file` is the bytes payload itself, not a file object.
+        if (PyString_AsStringAndSize(py_file, &buffer, &size) < 0) {
+            FAILWITH(IERR_INVALID_ARG);
+        }
 
-    if (!PyObject_Init((PyObject*) iterator, &ionc_read_IteratorType)) {
-        FAILWITH(IERR_INTERNAL_ERROR);
-    }
+        // TODO what if size is larger than SIZE ?
+        ION_READER_OPTIONS options;
+        memset(&options, 0, sizeof(options));
+        options.decimal_context = &dec_context;
+        options.max_annotation_count = ANNOTATION_MAX_LEN;
 
-    iterator->closed = FALSE;
-    iterator->file_handler_state.py_file = py_file;
-    iterator->emit_bare_values = emit_bare_values == Py_True;
-    memset(&iterator->reader, 0, sizeof(iterator->reader));
-    memset(&iterator->_reader_options, 0, sizeof(iterator->_reader_options));
-    iterator->_reader_options.decimal_context = &dec_context;
+        IONCHECK(ion_reader_open_buffer(&reader, (BYTE*)buffer, (SIZE)size, &options)); // NULL represents default reader options
 
-    IONCHECK(ion_reader_open_stream(
-        &iterator->reader,
-        &iterator->file_handler_state,
-        ion_read_file_stream_handler,
-        &iterator->_reader_options)); // NULL represents default reader options
-    return iterator;
+        top_level_container = PyList_New(0);
+        if (top_level_container == NULL) {
+            ion_reader_close(reader);
+            FAILWITH(IERR_INTERNAL_ERROR);
+        }
+        read_err = ionc_lazy_read_all(reader, top_level_container, FALSE, emit_bare_values == Py_True, buffer);
+        // Always close the reader, and drop the partial list if anything failed.
+        close_err = ion_reader_close(reader);
+        if (read_err != IERR_OK || close_err != IERR_OK) {
+            Py_DECREF(top_level_container);
+            FAILWITH((read_err != IERR_OK) ? read_err : close_err);
+        }
+        return top_level_container;
+    } else {
+        iterator = PyObject_New(ionc_read_Iterator, &ionc_read_IteratorType);
+        if (!iterator) {
+            FAILWITH(IERR_INTERNAL_ERROR);
+        }
+        Py_INCREF(py_file);
+
+        if (!PyObject_Init((PyObject*) iterator, &ionc_read_IteratorType)) {
+            FAILWITH(IERR_INTERNAL_ERROR);
+        }
+
+        iterator->closed = FALSE;
+        iterator->file_handler_state.py_file = py_file;
+        iterator->emit_bare_values = emit_bare_values == Py_True;
+        memset(&iterator->reader, 0, sizeof(iterator->reader));
+        memset(&iterator->_reader_options, 0, sizeof(iterator->_reader_options));
+        iterator->_reader_options.decimal_context = &dec_context;
+        IONCHECK(ion_reader_open_stream(
+            &iterator->reader,
+            &iterator->file_handler_state,
+            ion_read_file_stream_handler,
+            &iterator->_reader_options)); // NULL represents default reader options
+        return iterator;
+    }
 
 fail:
     if (iterator != NULL) {
         Py_DECREF(py_file);
@@ -1594,8 +1721,11 @@ PyObject* ionc_init_module(void) {
 
     _decimal_module = PyImport_ImportModule("decimal");
     _decimal_constructor = PyObject_GetAttrString(_decimal_module, "Decimal");
 
-    _simpletypes_module = PyImport_ImportModule("amazon.ion.simple_types");
+    _lazytype_module = PyImport_ImportModule("amazon.ion.lazy_type");
+    _ionpylazyobj_cls = PyObject_GetAttrString(_lazytype_module, "IonPyLazyObj");
+
+    _simpletypes_module = PyImport_ImportModule("amazon.ion.simple_types");
     _ionpynull_cls = PyObject_GetAttrString(_simpletypes_module, "IonPyNull");
     _ionpynull_fromvalue = PyObject_GetAttrString(_ionpynull_cls, "from_value");
     _ionpybool_cls = PyObject_GetAttrString(_simpletypes_module, "IonPyBool");
diff --git a/amazon/ion/lazy_type.py b/amazon/ion/lazy_type.py
new file mode 100644
index 000000000..d7a05846a
--- /dev/null
+++ b/amazon/ion/lazy_type.py
@@ -0,0 +1,46 @@
+# NOTE(review): simpleion imports this module too, which makes this import circular,
+# and nothing below uses it.
+from . import simpleion
+from .simple_types import _IonNature, IonPyNull, IonPyList
+
+
+class IonPyLazyObj(_IonNature):
+    """An Ion value kept as its binary encoding instead of being parsed.
+
+    The C extension creates one instance per value it reads when ``ionc_read`` is
+    called with ``parse_lazily=True``.
+
+    Attributes:
+        lazy_buffer: The bytes cached for the value.
+        lazy_type: The Ion type handed over together with the bytes.
+    """
+    # Defaults for the attributes assigned in __init__, so they exist on every instance.
+    lazy_buffer = None
+    lazy_type = None
+    # ion_buffer is not read by any code in this change. _IonNature already has
+    # ion_type; it is repeated here for test purposes.
+    ion_buffer = None
+    ion_type = None
+
+    def __init__(self, b, t, *args, **kwargs):
+        """Store the cached bytes ``b`` and the Ion type ``t``.
+
+        Remaining arguments are passed on to the base class initializer.
+        """
+        super().__init__(*args, **kwargs)
+        self.lazy_buffer = b
+        self.lazy_type = t
+
+    def wake_up(self):
+        """Return a real IonPy object for this lazy value.
+
+        Not called anywhere yet. Only the case without a cached buffer is
+        implemented; it returns ``IonPyNull()``.
+
+        Raises:
+            NotImplementedError: If a buffer is cached.
+        """
+        if self.lazy_buffer is None:
+            return IonPyNull()
+        else:
+            raise NotImplementedError('No text format support yet')
diff --git a/amazon/ion/simple_types.py b/amazon/ion/simple_types.py
index 1e7d9e57d..cf3dc2e84 100644
--- a/amazon/ion/simple_types.py
+++ b/amazon/ion/simple_types.py
@@ -28,6 +28,7 @@
 # in Python 3.10, abstract collections have moved into their own module
 # for compatibility with 3.10+, first try imports from the new location
 # if that fails, try from the pre-3.10 location
+
 try:
     from collections.abc import MutableMapping
 except:
diff --git a/amazon/ion/simpleion.py b/amazon/ion/simpleion.py
index 2d80e8978..aadcbf9ff 100644
--- a/amazon/ion/simpleion.py
+++ b/amazon/ion/simpleion.py
@@ -31,6 +31,7 @@
 from amazon.ion.writer_text import text_writer
 from .core import IonEvent, IonEventType, IonType, ION_STREAM_END_EVENT, Timestamp, ION_VERSION_MARKER_EVENT
 from .exceptions import IonException
+from .lazy_type import IonPyLazyObj
 from .reader import blocking_reader, NEXT_EVENT
 from .reader_binary import binary_reader
 from .reader_managed import managed_reader
@@ -450,7 +451,8 @@ def add(obj):
 
 
 def loads(ion_str, catalog=None, single_value=True, encoding='utf-8', cls=None, object_hook=None, parse_float=None,
-          parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True, **kw):
+          parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True,
+          parse_lazily=False, **kw):
     """Deserialize ``ion_str``, which is a string representation of an Ion object, to a Python object using the
     conversion table used by load (above).
 
@@ -489,7 +491,8 @@ def loads(ion_str, catalog=None, single_value=True, encoding='utf-8', cls=None,
 
     return load(ion_buffer, catalog=catalog, single_value=single_value, encoding=encoding, cls=cls,
                 object_hook=object_hook, parse_float=parse_float, parse_int=parse_int, parse_constant=parse_constant,
-                object_pairs_hook=object_pairs_hook, use_decimal=use_decimal, parse_eagerly=parse_eagerly)
+                object_pairs_hook=object_pairs_hook, use_decimal=use_decimal, parse_eagerly=parse_eagerly,
+                parse_lazily=parse_lazily)
 
 
 def dump_extension(obj, fp, binary=True, sequence_as_stream=False, tuple_as_sexp=False, omit_version_marker=False):
@@ -501,8 +504,14 @@ def dump_extension(obj, fp, binary=True, sequence_as_stream=False, tuple_as_sexp
     fp.write(res)
 
 
-def load_extension(fp, single_value=True, parse_eagerly=True):
-    iterator = ionc.ionc_read(fp, emit_bare_values=False)
+def load_extension(fp, single_value=True, parse_eagerly=True, parse_lazily=False):
+    # For easier test, ignore parse_eagerly when parse_lazily is set to True.
+    if parse_lazily and isinstance(fp, BytesIO):
+        data = fp.read()
+        fp.close()
+        return ionc.ionc_read(data, emit_bare_values=False, parse_lazily=True)
+
+    iterator = ionc.ionc_read(fp, emit_bare_values=False, parse_lazily=False)
     if single_value:
         try:
             value = next(iterator)
@@ -539,9 +548,10 @@ def dump(obj, fp, imports=None, binary=True, sequence_as_stream=False, skipkeys=
 
 
 def load(fp, catalog=None, single_value=True, encoding='utf-8', cls=None, object_hook=None, parse_float=None,
-         parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True, **kw):
+         parse_int=None, parse_constant=None, object_pairs_hook=None, use_decimal=None, parse_eagerly=True,
+         parse_lazily=False, **kw):
     if c_ext and catalog is None:
-        return load_extension(fp, parse_eagerly=parse_eagerly, single_value=single_value)
+        return load_extension(fp, parse_eagerly=parse_eagerly, parse_lazily=parse_lazily, single_value=single_value)
     else:
         return load_python(fp, catalog=catalog, single_value=single_value, encoding=encoding, cls=cls,
                            object_hook=object_hook, parse_float=parse_float, parse_int=parse_int,
diff --git a/amazon/ion/test.py b/amazon/ion/test.py
new file mode 100644
index 000000000..bb6d1139d
--- /dev/null
+++ b/amazon/ion/test.py
@@ -0,0 +1,19 @@
+import amazon.ion.simpleion as ion
+
+# Test data, its text representation is: ```[1, 2] [3]```
+# Usually, the original C extension loads the bytes below into a top-level list holding these values, e.g. [[1,2], [3]]
+ion_binary_bytes = b'\xe0\x01\x00\xea\xb4\x21\x01\x21\x02\xb2\x21\x03'
+
+# This should return a list like the one below:
+# obj = [<IonPyLazyObj>, <IonPyLazyObj>]
+obj = ion.loads(ion_binary_bytes, parse_lazily=True)
+# The first lazy object holds bytes \xb4\x21\x01\x21\x02 <- [1,2]
+print(f'obj[0].lazy_buffer is {obj[0].lazy_buffer}')
+# The second lazy object holds bytes \xb2\x21\x03 <- [3]
+print(f'obj[1].lazy_buffer is {obj[1].lazy_buffer}')
+
+# The dumped bytes are wrong because a blob is written instead of the cached bytes. The returned bytes are:
+# Bytes returned \xe0\x01\x00\xea \xba \xa5 \xb4 \x21\x01 \x21\x02 \xa3 \xb2 \x21\x03
+# Text representation IVM [ blob([ 1, 2 ]) blob([ 3 ])]
+# We should take out the blob wrapper later.
+print(ion.dumps(obj))