-
Notifications
You must be signed in to change notification settings - Fork 0
Implements a lazily_parse option for simpleIon load/dump. #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,4 +36,5 @@ | |
| 'writer_binary_raw_fields', | ||
| 'writer_buffer', | ||
| 'writer_text', | ||
| 'lazy_type', | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,7 +30,7 @@ static char _err_msg[ERR_MSG_MAX_LEN]; | |
| // Python 2/3 compatibility | ||
| #if PY_MAJOR_VERSION >= 3 | ||
| #define IONC_BYTES_FORMAT "y#" | ||
| #define IONC_READ_ARGS_FORMAT "OO" | ||
| #define IONC_READ_ARGS_FORMAT "OOO" | ||
| #define PyInt_AsSsize_t PyLong_AsSsize_t | ||
| #define PyInt_AsLong PyLong_AsLong | ||
| #define PyInt_FromLong PyLong_FromLong | ||
|
|
@@ -41,7 +41,7 @@ static char _err_msg[ERR_MSG_MAX_LEN]; | |
| #define PyInt_Check PyLong_Check | ||
| #else | ||
| #define IONC_BYTES_FORMAT "s#" | ||
| #define IONC_READ_ARGS_FORMAT "OOO" | ||
| #define IONC_READ_ARGS_FORMAT "OOOO" | ||
| #endif | ||
|
|
||
| #if PY_VERSION_HEX < 0x02070000 | ||
|
|
@@ -53,6 +53,8 @@ static PyObject* _math_module; | |
| static PyObject* _decimal_module; | ||
| static PyObject* _decimal_constructor; | ||
| static PyObject* _py_timestamp_constructor; | ||
| static PyObject* _lazytype_module; | ||
| static PyObject* _ionpylazyobj_cls; | ||
| static PyObject* _simpletypes_module; | ||
| static PyObject* _ionpynull_cls; | ||
| static PyObject* _ionpynull_fromvalue; | ||
|
|
@@ -792,6 +794,28 @@ iERR ionc_write_value(hWRITER writer, PyObject* obj, PyObject* tuple_as_sexp) { | |
| IONCHECK(ionc_write_sequence(writer, obj, tuple_as_sexp)); | ||
| IONCHECK(ion_writer_finish_container(writer)); | ||
| } | ||
| /* When we are going to serialize a lazy Ion object, we should check if this object holds a cached binary | ||
| representation already. If so, write the cached bytes into writer's temporary buffer directly. However I didn't find | ||
| an API in ion-c's ion_writer.h to append arbitrary bytes in writer's temporary buffer (correct me if I missed it) so | ||
| I write an IonBlob value instead, which is wrong! It's only used for visible and debugging. Later on we should decide | ||
| if/how to expose the existing API in ion-c for CPython layer so that it's able to append custom bytes into writer's | ||
| value_stream instead of only be able to call the writer to writes bytes. | ||
| */ | ||
| else if (PyObject_IsInstance(obj, _ionpylazyobj_cls)) { | ||
| Py_ssize_t len; | ||
| char* c_buf; | ||
|
|
||
| // TODO check the existence of lazy buffer first. Since all lazy objects hold a buffer by my sample code... I skipped validation | ||
| PyObject* lazy_buffer = PyObject_GetAttrString(obj, "lazy_buffer"); | ||
|
|
||
| // Convert python bytes to C bytes | ||
| if (PyBytes_AsStringAndSize(lazy_buffer, &c_buf, &len) < 0) { | ||
| _FAILWITHMSG(IERR_INVALID_ARG, "Binary conversion error."); | ||
| } | ||
| // TODO should append the c_buf to _value_stream directly. Not an Ion BLOB! | ||
| // E.g. Something like ION_PUT(writer->_typed_writer.binary._value_stream, c_buf); | ||
| IONCHECK(ion_writer_write_blob(writer, (BYTE*)c_buf, len)); | ||
| } | ||
| else { | ||
| _FAILWITHMSG(IERR_INVALID_STATE, "Cannot dump arbitrary object types."); | ||
| } | ||
|
|
@@ -1503,43 +1527,108 @@ void ionc_read_iter_dealloc(PyObject *self) { | |
| PyObject_Del(self); | ||
| } | ||
|
|
||
| iERR ionc_lazy_read_all(hREADER hreader, PyObject* container, BOOL in_struct, BOOL emit_bare_values, char* buffer) { | ||
| iENTER; | ||
| ION_TYPE t; | ||
| for (;;) { | ||
| IONCHECK(ion_reader_next(hreader, &t)); | ||
| if (t == tid_EOF) { | ||
| assert(t == tid_EOF && "next() at end"); | ||
| break; | ||
| } | ||
| PyObject* rtn; | ||
| PyObject* py_cached_bytes; | ||
|
|
||
| // Calculate the start position and the length of the value that writer is stand on so we can cache this bytes | ||
| // Start position | ||
| POSITION p_offset = 0; | ||
| // Value length | ||
| SIZE p_length = 0; | ||
| ion_reader_get_value_offset(hreader, &p_offset); | ||
| ion_reader_get_value_length(hreader, &p_length); | ||
|
Comment on lines
+1547
to
+1548
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, I forgot ion-c provided this functionality. It might not be well-tested, however... That is worth looking into. We want to make sure it's accurate in all cases.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, will leave this comment unsolved first. |
||
| // debug | ||
| printf("p_offset is: %" PRId64 "\n", p_offset); | ||
| printf("p_length is: %" PRId32 "\n\n", p_length); | ||
|
|
||
| // TODO is it possible to return a memory view pointing to the specific position of the original buffer? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| // Python Bytes? Py_BuildValue("y#", buffer+p_offset, p_length), | ||
| // Python MemoryView? PyMemoryView_FromMemory(buffer+p_offset, p_length, PyBUF_WRITE), | ||
| py_cached_bytes = Py_BuildValue("y#", buffer+p_offset, p_length); | ||
|
|
||
| // Below returns an IonPyLazyObj holding cached bytes, and is equivalence to: | ||
| // IonPyLazyObj(py_cached_bytes, t, None); | ||
| rtn = PyObject_CallFunctionObjArgs( | ||
| _ionpylazyobj_cls, | ||
| py_cached_bytes, | ||
| py_ion_type_table[ION_TYPE_INT(t) >> 8], | ||
| NULL | ||
| ); | ||
|
|
||
| ionc_add_to_container(container, rtn, in_struct, NULL); | ||
| Py_DECREF(py_cached_bytes); | ||
| } | ||
| iRETURN; | ||
| } | ||
|
|
||
| /* | ||
| * Entry point of read/load functions | ||
| */ | ||
| PyObject* ionc_read(PyObject* self, PyObject *args, PyObject *kwds) { | ||
| iENTER; | ||
| PyObject *py_file = NULL; // TextIOWrapper | ||
| PyObject *emit_bare_values; | ||
| PyObject *parse_lazily; | ||
| ionc_read_Iterator *iterator = NULL; | ||
| static char *kwlist[] = {"file", "emit_bare_values", NULL}; | ||
| if (!PyArg_ParseTupleAndKeywords(args, kwds, IONC_READ_ARGS_FORMAT, kwlist, &py_file, &emit_bare_values)) { | ||
| static char *kwlist[] = {"file", "emit_bare_values", "parse_lazily", NULL}; | ||
| if (!PyArg_ParseTupleAndKeywords(args, kwds, IONC_READ_ARGS_FORMAT, kwlist, &py_file, &emit_bare_values, &parse_lazily)) { | ||
| FAILWITH(IERR_INVALID_ARG); | ||
| } | ||
|
|
||
| iterator = PyObject_New(ionc_read_Iterator, &ionc_read_IteratorType); | ||
| if (!iterator) { | ||
| FAILWITH(IERR_INTERNAL_ERROR); | ||
| } | ||
| Py_INCREF(py_file); | ||
| // Store the stream in IonPyObj until it actually needs to be serialized. | ||
| if (parse_lazily == Py_True) { | ||
| hREADER reader; | ||
| char *buffer = NULL; | ||
| long size; | ||
| PyObject *top_level_container = NULL; | ||
| PyString_AsStringAndSize(py_file, &buffer, &size); | ||
|
|
||
| if (!PyObject_Init((PyObject*) iterator, &ionc_read_IteratorType)) { | ||
| FAILWITH(IERR_INTERNAL_ERROR); | ||
| } | ||
| // TODO what if size is larger than SIZE ? | ||
| ION_READER_OPTIONS options; | ||
| memset(&options, 0, sizeof(options)); | ||
| options.decimal_context = &dec_context; | ||
| options.max_annotation_count = ANNOTATION_MAX_LEN; | ||
|
|
||
| iterator->closed = FALSE; | ||
| iterator->file_handler_state.py_file = py_file; | ||
| iterator->emit_bare_values = emit_bare_values == Py_True; | ||
| memset(&iterator->reader, 0, sizeof(iterator->reader)); | ||
| memset(&iterator->_reader_options, 0, sizeof(iterator->_reader_options)); | ||
| iterator->_reader_options.decimal_context = &dec_context; | ||
| IONCHECK(ion_reader_open_buffer(&reader, (BYTE*)buffer, (SIZE)size, &options)); // NULL represents default reader options | ||
|
|
||
| IONCHECK(ion_reader_open_stream( | ||
| &iterator->reader, | ||
| &iterator->file_handler_state, | ||
| ion_read_file_stream_handler, | ||
| &iterator->_reader_options)); // NULL represents default reader options | ||
| return iterator; | ||
| top_level_container = PyList_New(0); | ||
| IONCHECK(ionc_lazy_read_all(reader, top_level_container, FALSE, emit_bare_values == Py_True, buffer)); | ||
| IONCHECK(ion_reader_close(reader)); | ||
| return top_level_container; | ||
| } else { | ||
| iterator = PyObject_New(ionc_read_Iterator, &ionc_read_IteratorType); | ||
| if (!iterator) { | ||
| FAILWITH(IERR_INTERNAL_ERROR); | ||
| } | ||
| Py_INCREF(py_file); | ||
|
|
||
| if (!PyObject_Init((PyObject*) iterator, &ionc_read_IteratorType)) { | ||
| FAILWITH(IERR_INTERNAL_ERROR); | ||
| } | ||
|
|
||
| iterator->closed = FALSE; | ||
| iterator->file_handler_state.py_file = py_file; | ||
| iterator->emit_bare_values = emit_bare_values == Py_True; | ||
| memset(&iterator->reader, 0, sizeof(iterator->reader)); | ||
| memset(&iterator->_reader_options, 0, sizeof(iterator->_reader_options)); | ||
| iterator->_reader_options.decimal_context = &dec_context; | ||
|
|
||
| IONCHECK(ion_reader_open_stream( | ||
| &iterator->reader, | ||
| &iterator->file_handler_state, | ||
| ion_read_file_stream_handler, | ||
| &iterator->_reader_options)); // NULL represents default reader options | ||
| return iterator; | ||
| } | ||
| fail: | ||
| if (iterator != NULL) { | ||
| Py_DECREF(py_file); | ||
|
|
@@ -1594,8 +1683,11 @@ PyObject* ionc_init_module(void) { | |
|
|
||
| _decimal_module = PyImport_ImportModule("decimal"); | ||
| _decimal_constructor = PyObject_GetAttrString(_decimal_module, "Decimal"); | ||
| _simpletypes_module = PyImport_ImportModule("amazon.ion.simple_types"); | ||
|
|
||
| _lazytype_module = PyImport_ImportModule("amazon.ion.lazy_type"); | ||
| _ionpylazyobj_cls = PyObject_GetAttrString(_lazytype_module, "IonPyLazyObj"); | ||
|
|
||
| _simpletypes_module = PyImport_ImportModule("amazon.ion.simple_types"); | ||
| _ionpynull_cls = PyObject_GetAttrString(_simpletypes_module, "IonPyNull"); | ||
| _ionpynull_fromvalue = PyObject_GetAttrString(_ionpynull_cls, "from_value"); | ||
| _ionpybool_cls = PyObject_GetAttrString(_simpletypes_module, "IonPyBool"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| from . import simpleion | ||
| from .simple_types import _IonNature, IonPyNull, IonPyList | ||
|
|
||
|
|
||
| class IonPyLazyObj(_IonNature): | ||
| """ | ||
| Representation of an IonPyObj that generated by lazily_parse. | ||
| IonNature had ion_type already but I put it here for test purpose. | ||
| """ | ||
| ion_buffer = None | ||
| ion_type = None | ||
|
|
||
| def __init__(self, b, t, *args, **kwargs): | ||
| super().__init__(*args, **kwargs) | ||
| self.lazy_buffer = b | ||
| self.lazy_type = t | ||
|
|
||
| # Wake up the lazy object, return a real IonPyObj | ||
| # This might be helpful later, but is not used at all for now | ||
| def wake_up(self): | ||
| if self.lazy_buffer is None: | ||
| return IonPyNull() | ||
| else: | ||
| raise NotImplementedError('No text format support yet') |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| import amazon.ion.simpleion as ion | ||
|
|
||
| # Test data, its text representation is: ```[1, 2] [3]``` | ||
| # Usually, the original C extension loads below bytes into a top-level list holding below values E.g. [[1,2], [3]] | ||
| ion_binary_bytes = b'\xe0\x01\x00\xea\xb4\x21\x01\x21\x02\xb2\x21\x03' | ||
|
|
||
| # This should return a list below: | ||
| # obj = [<IonPyLazyObj holding [1,2]>, <IonPyLazyObj holding [3]>] | ||
| obj = ion.loads(ion_binary_bytes, parse_lazily=True) | ||
| # The first lazy object holds bytes \xb4\x21\x01\x21\x02 <- [1,2] | ||
| print(f'obj[0].lazy_buffer is {obj[0].lazy_buffer}') | ||
| # The second lazy object holds bytes \xb2\x21\x03 <- [3] | ||
| print(f'obj[1].lazy_buffer is {obj[1].lazy_buffer}') | ||
|
|
||
| # The returned bytes is wrong because I wrote a blob instead of cached bytes, the returned bytes are: | ||
| # Bytes returned \xe0\x01\x00\xea \xba \xa5 \xb4 \x21\x01 \x21\x02 \xa3 \xb2 \x21\x03 | ||
| # Text representation IVM [ blob([ 1, 2 ]) blob([ 3 ])] | ||
| # We should take out the blob wrapper later. | ||
| print(ion.dumps(obj)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a good callout. We need to decide the best way to expose this functionality in ion-c. It's probably not an API we'd want to add to the public
ion_writer.hheader.I wonder if we can just flush the writer before writing raw bytes, and then write those bytes directly to the underlying
ION_STREAM...? If we can do that it may help us avoid having to add to the interface.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, it works. But I need to investigate why ion_writer_flush() didn't work in my sample code so the output is not in a right order...
Current logic parses a list Lazy object to an Ion list including child lazy objects, we don't know the total length when we append the cached bytes in the output ion_stream so it requires a little more calculation.
To know how to calculate the length, the question then turns to how we handle a lazy list. My idea is:
ION_TYPE.LISTlike other Ion types.partially_deserializethe Lazy object into a list holding all child lazy objects in order. They can add new Ion values or delete existing values.fully_deserializethe lazy object into a regular Ion list class holding all deserialized Ion values. They can add, delete or modify values.The second point above partially deserialize the list so we can reuse the cached binary of its child lazy objects. In other words, user just need to deserialize desired Ion values instead of a full list. Later on if users want to modify a specific child lazy object within the List/Sexp, they can deserialize that lazy object, modify it, and replaced the original lazy object with the modified Ion value. This sounds doable but still need a little more experiment, if partial deserialization and the strategy of modifying lazy objects is faster (and correct too), there is no need to do a fully deserialization. In addition we should consider Ion struct situation since it is unordered and allows duplicated keys.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Figured out, I forgot to call it in the top-level.