Skip to content

Commit e5ca3fc

Browse files
ivandaschisapego
authored andcommitted
IGNITE-14186 Implement C module to speedup hashcode
This closes #17
1 parent 7743b23 commit e5ca3fc

14 files changed

Lines changed: 661 additions & 65 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
.idea
2+
.benchmarks
23
.vscode
34
.eggs
45
.pytest_cache
56
.tox
7+
*.so
8+
build
9+
distr
610
tests/config/*.xml
711
junit*.xml
812
pyignite.egg-info

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
recursive-include requirements *
2+
include README.md

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,19 @@ $ pip install -r requirements/<your task>.txt
3636

3737
You may also want to consult the `setuptools` manual about using `setup.py`.
3838

39+
### *optional C extension*
40+
There is an optional C extension to speed up some computationally intensive tasks. If its compilation fails
41+
(missing compiler or CPython headers), `pyignite` will be installed without this module.
42+
43+
- On Linux or macOS, only a C compiler (`gcc` or `clang`) is required. The extension is compiled during the standard setup process.
44+
- To build universal `wheels` (binary packages) for Linux, just invoke the script `./scripts/create_distr.sh`.
45+
46+
***NB!* Docker is required.**
47+
48+
Ready wheels for `x86` and `x86-64` for different python versions (3.6, 3.7, 3.8 and 3.9) will be
49+
located in `./distr` directory.
50+
51+
3952
### Updating from older version
4053

4154
To upgrade an existing package, use the following command:

cext/cutils.c

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
#include <Python.h>
19+
20+
#ifdef _MSC_VER
21+
22+
typedef __int32 int32_t;
23+
typedef unsigned __int32 uint32_t;
24+
typedef __int64 int64_t;
25+
typedef unsigned __int64 uint64_t;
26+
27+
#else
28+
#include <stdint.h>
29+
#endif
30+
31+
static int32_t FNV1_OFFSET_BASIS = 0x811c9dc5;
32+
static int32_t FNV1_PRIME = 0x01000193;
33+
34+
35+
PyObject* hashcode(PyObject* self, PyObject *args);
36+
PyObject* schema_id(PyObject* self, PyObject *args);
37+
38+
PyObject* str_hashcode(PyObject* data);
39+
int32_t str_hashcode_(PyObject* data, int lower);
40+
PyObject* b_hashcode(PyObject* data);
41+
42+
static PyMethodDef methods[] = {
43+
{"hashcode", (PyCFunction) hashcode, METH_VARARGS, ""},
44+
{"schema_id", (PyCFunction) schema_id, METH_VARARGS, ""},
45+
{NULL, NULL, 0, NULL} /* Sentinel */
46+
};
47+
48+
static struct PyModuleDef moduledef = {
49+
PyModuleDef_HEAD_INIT,
50+
"_cutils",
51+
0, /* m_doc */
52+
-1, /* m_size */
53+
methods, /* m_methods */
54+
NULL, /* m_slots */
55+
NULL, /* m_traverse */
56+
NULL, /* m_clear */
57+
NULL, /* m_free */
58+
};
59+
60+
static char* hashcode_input_err = "supported only strings, bytearrays, bytes and memoryview";
61+
static char* schema_id_input_err = "input argument must be dict or int";
62+
static char* schema_field_type_err = "schema keys must be strings";
63+
64+
PyMODINIT_FUNC PyInit__cutils(void) {
65+
return PyModule_Create(&moduledef);
66+
}
67+
68+
/*
 * Python-callable `hashcode(data)`.
 *
 * Takes exactly one positional argument and dispatches on its type:
 *   - None          -> 0
 *   - exact `str`   -> str_hashcode()
 *   - anything else -> b_hashcode(), which does its own type validation
 *
 * Returns the hash as a Python int, or NULL with an exception set.
 */
PyObject* hashcode(PyObject* self, PyObject *args) {
    PyObject* arg = NULL;

    if (!PyArg_ParseTuple(args, "O", &arg)) {
        return NULL;
    }

    if (arg == Py_None) {
        return PyLong_FromLong(0);
    }

    return PyUnicode_CheckExact(arg) ? str_hashcode(arg) : b_hashcode(arg);
}
85+
86+
/*
 * Hash a unicode object without lower-casing it and wrap the 32-bit
 * result in a Python int.
 */
PyObject* str_hashcode(PyObject* data) {
    int32_t hash = str_hashcode_(data, 0);

    return PyLong_FromLong(hash);
}
89+
90+
/*
 * Compute the 32-bit polynomial hash of a unicode object:
 *
 *     h = 31 * h + code_point      (for every code point, starting at h = 0)
 *
 * with the arithmetic wrapping modulo 2^32.
 *
 * str    unicode object to hash; the caller must have verified the type.
 * lower  when non-zero, each code point is passed through
 *        Py_UNICODE_TOLOWER before being mixed in.
 *
 * Returns the hash reinterpreted as a signed 32-bit value (0 for an empty
 * string).
 */
int32_t str_hashcode_(PyObject *str, int lower) {
    /*
     * Accumulate in uint32_t: overflowing a signed int is undefined
     * behaviour in C, while unsigned arithmetic is defined to wrap,
     * which is exactly the modulo-2^32 behaviour the hash relies on.
     */
    uint32_t res = 0;

    Py_ssize_t sz = PyUnicode_GET_LENGTH(str);
    if (!sz) {
        return 0;
    }

    int kind = PyUnicode_KIND(str);
    void* buf = PyUnicode_DATA(str);

    Py_ssize_t i;
    for (i = 0; i < sz; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, buf, i);

        if (lower) {
            ch = Py_UNICODE_TOLOWER(ch);
        }

        res = 31u * res + (uint32_t)ch;
    }

    /* Single conversion back to the signed representation callers expect. */
    return (int32_t)res;
}
114+
115+
/*
 * Compute the 32-bit polynomial hash of a binary object:
 *
 *     h = 31 * h + (signed char)byte      (starting at h = 1)
 *
 * with the arithmetic wrapping modulo 2^32. Each byte is sign-extended
 * before being mixed in.
 *
 * data  an exact `bytes`, an exact `bytearray`, or a `memoryview`.
 *
 * Returns the hash as a Python int holding a signed 32-bit value.
 * On failure returns NULL with an exception set: ValueError for an
 * unsupported type, MemoryError if a temporary copy cannot be allocated.
 */
PyObject* b_hashcode(PyObject* data) {
    /*
     * Accumulate in uint32_t: signed overflow is undefined behaviour,
     * unsigned arithmetic wraps modulo 2^32 as the hash requires.
     */
    uint32_t res = 1;
    Py_ssize_t sz;
    const char* buf;
    char* copy = NULL;  /* owned scratch buffer, only for strided views */

    if (PyBytes_CheckExact(data)) {
        sz = PyBytes_GET_SIZE(data);
        buf = PyBytes_AS_STRING(data);
    }
    else if (PyByteArray_CheckExact(data)) {
        sz = PyByteArray_GET_SIZE(data);
        buf = PyByteArray_AS_STRING(data);
    }
    else if (PyMemoryView_Check(data)) {
        Py_buffer* view = PyMemoryView_GET_BUFFER(data);
        sz = view->len;

        if (PyBuffer_IsContiguous(view, 'C')) {
            buf = (const char*)view->buf;
        }
        else {
            /*
             * A sliced/strided view does not store its bytes consecutively
             * from view->buf, so reading `len` bytes linearly would hash
             * the wrong data or run outside the exporter's memory.
             * Flatten it into a temporary C-contiguous copy first.
             */
            copy = PyMem_Malloc((size_t)sz);
            if (copy == NULL) {
                return PyErr_NoMemory();
            }
            if (PyBuffer_ToContiguous(copy, view, sz, 'C') < 0) {
                PyMem_Free(copy);
                return NULL;
            }
            buf = copy;
        }
    }
    else {
        PyErr_SetString(PyExc_ValueError, hashcode_input_err);
        return NULL;
    }

    Py_ssize_t i;
    for (i = 0; i < sz; i++) {
        /* Sign-extend the byte, then let the conversion to unsigned wrap. */
        res = 31u * res + (uint32_t)(int32_t)(signed char)buf[i];
    }

    PyMem_Free(copy);  /* no-op when no copy was made */

    return PyLong_FromLong((int32_t)res);
}
144+
145+
/*
 * Python-callable `schema_id(schema)`.
 *
 * Takes exactly one positional argument:
 *   - exact `int`  -> returned as an int unchanged in value
 *   - None         -> 0
 *   - dict         -> 0 when empty; otherwise an FNV-1 style hash seeded
 *                     with FNV1_OFFSET_BASIS. For each key, in dict
 *                     iteration order, the lower-cased string hash of the
 *                     key is mixed in one byte at a time, least
 *                     significant byte first: xor the byte, then multiply
 *                     by FNV1_PRIME.
 *
 * Returns a Python int holding a signed 32-bit value, or NULL with
 * ValueError set when the argument is of another type or a dict key is
 * not an exact `str`.
 */
PyObject* schema_id(PyObject* self, PyObject *args) {
    PyObject* data;

    if (!PyArg_ParseTuple(args, "O", &data)) {
        return NULL;
    }

    if (PyLong_CheckExact(data)) {
        return PyNumber_Long(data);
    }

    if (data == Py_None) {
        return PyLong_FromLong(0);
    }

    if (!PyDict_Check(data)) {
        PyErr_SetString(PyExc_ValueError, schema_id_input_err);
        return NULL;
    }

    if (PyDict_Size(data) == 0) {
        return PyLong_FromLong(0);
    }

    /*
     * All mixing is done in uint32_t: the multiplication is expected to
     * wrap modulo 2^32, which is undefined behaviour for signed integers
     * but well defined for unsigned ones. Unsigned shifts also avoid the
     * implementation-defined right shift of a negative field hash.
     */
    uint32_t s_id = (uint32_t)FNV1_OFFSET_BASIS;
    const uint32_t prime = (uint32_t)FNV1_PRIME;

    PyObject *key, *value;
    Py_ssize_t pos = 0;

    while (PyDict_Next(data, &pos, &key, &value)) {
        if (!PyUnicode_CheckExact(key)) {
            PyErr_SetString(PyExc_ValueError, schema_field_type_err);
            return NULL;
        }

        uint32_t field_id = (uint32_t)str_hashcode_(key, 1);

        /* Feed the four bytes of the field hash, low byte first. */
        int shift;
        for (shift = 0; shift < 32; shift += 8) {
            s_id ^= (field_id >> shift) & 0xffu;
            s_id *= prime;
        }
    }

    return PyLong_FromLong((int32_t)s_id);
}

pyignite/api/binary.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pyignite.datatypes import String, Int, Bool
2323
from pyignite.queries import Query
2424
from pyignite.queries.op_codes import *
25-
from pyignite.utils import int_overflow, entity_id
25+
from pyignite.utils import entity_id, schema_id
2626
from .result import APIResult
2727
from ..stream import BinaryStream, READ_BACKWARD
2828
from ..queries.response import Response
@@ -137,7 +137,7 @@ def put_binary_type(
137137
'is_enum': is_enum,
138138
'schema': [],
139139
}
140-
schema_id = None
140+
s_id = None
141141
if is_enum:
142142
data['enums'] = []
143143
for literal, ordinal in schema.items():
@@ -147,7 +147,7 @@ def put_binary_type(
147147
})
148148
else:
149149
# assemble schema and calculate schema ID in one go
150-
schema_id = FNV1_OFFSET_BASIS if schema else 0
150+
s_id = schema_id(schema)
151151
for field_name, data_type in schema.items():
152152
# TODO: check for allowed data types
153153
field_id = entity_id(field_name)
@@ -159,17 +159,9 @@ def put_binary_type(
159159
),
160160
'field_id': field_id,
161161
})
162-
schema_id ^= (field_id & 0xff)
163-
schema_id = int_overflow(schema_id * FNV1_PRIME)
164-
schema_id ^= ((field_id >> 8) & 0xff)
165-
schema_id = int_overflow(schema_id * FNV1_PRIME)
166-
schema_id ^= ((field_id >> 16) & 0xff)
167-
schema_id = int_overflow(schema_id * FNV1_PRIME)
168-
schema_id ^= ((field_id >> 24) & 0xff)
169-
schema_id = int_overflow(schema_id * FNV1_PRIME)
170162

171163
data['schema'].append({
172-
'schema_id': schema_id,
164+
'schema_id': s_id,
173165
'schema_fields': [
174166
{'schema_field_id': entity_id(x)} for x in schema
175167
],

pyignite/utils.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@
2323
from pyignite.datatypes.base import IgniteDataType
2424
from .constants import *
2525

26+
FALLBACK = False
27+
28+
try:
29+
from pyignite import _cutils
30+
except ImportError:
31+
FALLBACK = True
32+
2633

2734
LONG_MASK = 0xffffffff
2835
DIGITS_PER_INT = 9
@@ -91,6 +98,13 @@ def hashcode(data: Union[str, bytes, bytearray, memoryview]) -> int:
9198
:param data: UTF-8-encoded string identifier of binary buffer or byte array
9299
:return: hash code.
93100
"""
101+
if FALLBACK:
102+
return __hashcode_fallback(data)
103+
104+
return _cutils.hashcode(data)
105+
106+
107+
def __hashcode_fallback(data: Union[str, bytes, bytearray, memoryview]) -> int:
94108
if isinstance(data, str):
95109
"""
96110
For strings we iterate over code point which are of the int type
@@ -147,13 +161,21 @@ def schema_id(schema: Union[int, dict]) -> int:
147161
:param schema: a dict of field names: field types,
148162
:return: schema ID.
149163
"""
150-
if type(schema) is int:
164+
if FALLBACK:
165+
return __schema_id_fallback(schema)
166+
return _cutils.schema_id(schema)
167+
168+
169+
def __schema_id_fallback(schema: Union[int, dict]) -> int:
170+
if isinstance(schema, int):
151171
return schema
172+
152173
if schema is None:
153174
return 0
175+
154176
s_id = FNV1_OFFSET_BASIS if schema else 0
155177
for field_name in schema.keys():
156-
field_id = entity_id(field_name)
178+
field_id = __hashcode_fallback(field_name.lower())
157179
s_id ^= (field_id & 0xff)
158180
s_id = int_overflow(s_id * FNV1_PRIME)
159181
s_id ^= ((field_id >> 8) & 0xff)

requirements/install.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
# these pip packages are necessary for the pyignite to run
22

3-
typing==3.6.6; python_version<'3.5'
43
attrs==18.1.0

scripts/build_wheels.sh

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/bin/bash
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
set -e -u -x
18+
19+
# Repair a single wheel with auditwheel.
#   $1 - path to the wheel file
# When `auditwheel show` accepts the wheel, it is repaired for the platform
# tag in $PLAT and the result is written to /wheels; otherwise the wheel is
# reported as skipped.
function repair_wheel {
    wheel="$1"
    if auditwheel show "$wheel"; then
        auditwheel repair "$wheel" --plat "$PLAT" -w /wheels
    else
        echo "Skipping non-platform wheel $wheel"
    fi
}
27+
28+
# Compile wheels
29+
for PYBIN in /opt/python/*/bin; do
30+
if [[ $PYBIN =~ ^(.*)cp3[6789](.*)$ ]]; then
31+
"${PYBIN}/pip" wheel /pyignite/ --no-deps -w /wheels
32+
fi
33+
done
34+
35+
# Bundle external shared libraries into the wheels
36+
for whl in /wheels/*.whl; do
37+
repair_wheel "$whl"
38+
done
39+
40+
for whl in /wheels/*.whl; do
41+
if [[ ! $whl =~ ^(.*)manylinux(.*)$ ]]; then
42+
rm "$whl"
43+
else
44+
chmod 666 "$whl"
45+
fi
46+
done
47+
48+
rm -rf /pyignite/*.egg-info
49+
rm -rf /pyignite/.eggs

0 commit comments

Comments
 (0)