Skip to content

Commit e62fbb9

Browse files
isapegoivandasch
andauthored
GG-32909 IGNITE-14186 Implement C module to speedup hashcode (#28)
(cherry picked from commit e5ca3fc) Co-authored-by: Ivan Dashchinskiy <ivandasch@gmail.com>
1 parent 8e23a7b commit e62fbb9

13 files changed

Lines changed: 668 additions & 64 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
.idea
2+
.benchmarks
23
.vscode
34
.eggs
45
.pytest_cache
56
.tox
7+
*.so
8+
build
9+
distr
610
tests/config/*.xml
711
junit*.xml
812
pygridgain.egg-info

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
recursive-include requirements *
2+
include README.md

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,19 @@ $ pip install -r requirements/<your task>.txt
4141

4242
You may also want to consult the `setuptools` manual about using `setup.py`.
4343

44+
### *optional C extension*
45+
There is an optional C extension to speed up some computationally intensive tasks. If its compilation fails
46+
(missing compiler or CPython headers), `pygridgain` will be installed without this module.
47+
48+
- On Linux or macOS, only a C compiler is required (`gcc` or `clang`). The extension is compiled during the standard setup process.
49+
- To build universal `wheels` (binary packages) for Linux, just invoke the script `./scripts/create_distr.sh`.
50+
51+
***NB!* Docker is required.**
52+
53+
Ready wheels for `x86` and `x86-64` for different python versions (3.6, 3.7, 3.8 and 3.9) will be
54+
located in `./distr` directory.
55+
56+
4457
### Updating from older version
4558

4659
To upgrade an existing package, use the following command:

cext/cutils.c

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
/*
2+
* Copyright 2019 GridGain Systems, Inc. and Contributors.
3+
*
4+
* Licensed under the GridGain Community Edition License (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.gridgain.com/products/software/community-edition/gridgain-community-edition-license
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <Python.h>
18+
19+
#ifdef _MSC_VER
20+
21+
typedef __int32 int32_t;
22+
typedef unsigned __int32 uint32_t;
23+
typedef __int64 int64_t;
24+
typedef unsigned __int64 uint64_t;
25+
26+
#else
27+
#include <stdint.h>
28+
#endif
29+
30+
/*
 * FNV-1 parameters consumed by schema_id().
 *
 * Both are read-only, hence `const`.  They stay signed 32-bit so existing
 * users of the names keep compiling unchanged.  The offset basis 0x811c9dc5
 * does not fit into int32_t; the explicit cast documents that the value is
 * stored as its 32-bit two's-complement bit pattern (a negative number).
 */
static const int32_t FNV1_OFFSET_BASIS = (int32_t)0x811c9dc5u;
static const int32_t FNV1_PRIME = 0x01000193;
32+
33+
34+
/*
 * Forward declarations.
 *
 * Everything here is reached only through the method table of this
 * translation unit, so the functions get internal linkage.  A later
 * definition written without `static` inherits the linkage of these
 * declarations, which keeps the generic names (hashcode, schema_id, ...)
 * out of the shared object's exported symbols.
 */

/* Python-callable entry points (METH_VARARGS). */
static PyObject *hashcode(PyObject *self, PyObject *args);
static PyObject *schema_id(PyObject *self, PyObject *args);

/* Internal helpers. */
static PyObject *str_hashcode(PyObject *data);
static int32_t str_hashcode_(PyObject *str, int lower);
static PyObject *b_hashcode(PyObject *data);
40+
41+
static PyMethodDef methods[] = {
42+
{"hashcode", (PyCFunction) hashcode, METH_VARARGS, ""},
43+
{"schema_id", (PyCFunction) schema_id, METH_VARARGS, ""},
44+
{NULL, NULL, 0, NULL} /* Sentinel */
45+
};
46+
47+
static struct PyModuleDef moduledef = {
48+
PyModuleDef_HEAD_INIT,
49+
"_cutils",
50+
0, /* m_doc */
51+
-1, /* m_size */
52+
methods, /* m_methods */
53+
NULL, /* m_slots */
54+
NULL, /* m_traverse */
55+
NULL, /* m_clear */
56+
NULL, /* m_free */
57+
};
58+
59+
/*
 * Messages attached to the ValueError raised on invalid input.
 * They point at string literals, so both the characters and the pointers
 * are declared const.
 */
static const char *const hashcode_input_err = "supported only strings, bytearrays, bytes and memoryview";
static const char *const schema_id_input_err = "input argument must be dict or int";
static const char *const schema_field_type_err = "schema keys must be strings";
62+
63+
PyMODINIT_FUNC PyInit__cutils(void) {
64+
return PyModule_Create(&moduledef);
65+
}
66+
67+
/*
 * hashcode(data) -> int
 *
 * Dispatches on the type of the single positional argument:
 *   - None      -> 0
 *   - exact str -> str_hashcode()
 *   - otherwise -> b_hashcode(), which raises ValueError for anything that
 *                  is not bytes, bytearray or memoryview.
 *
 * Returns a new reference, or NULL with an exception set.
 */
PyObject *hashcode(PyObject *self, PyObject *args)
{
    PyObject *arg;

    if (!PyArg_ParseTuple(args, "O", &arg)) {
        return NULL;
    }

    if (arg == Py_None) {
        return PyLong_FromLong(0);
    }

    if (PyUnicode_CheckExact(arg)) {
        return str_hashcode(arg);
    }

    return b_hashcode(arg);
}
84+
85+
/*
 * Hash a str object as-is (no lower-casing) and box the 32-bit result
 * into a Python int.  Returns a new reference.
 */
PyObject *str_hashcode(PyObject *data)
{
    const int32_t hash = str_hashcode_(data, 0);

    return PyLong_FromLong(hash);
}
88+
89+
/*
 * Compute the 32-bit polynomial hash  h = 31 * h + ch  over the code points
 * of `str`, starting from 0.
 *
 * str   - a str object (the callers check with PyUnicode_CheckExact).
 * lower - when non-zero, every code point is passed through
 *         Py_UNICODE_TOLOWER before being mixed in.
 *
 * Returns the hash reinterpreted as a signed 32-bit value; an empty string
 * hashes to 0.  This function never sets a Python exception.
 *
 * The accumulator is unsigned on purpose: the hash relies on wrap-around
 * modulo 2^32, which is undefined behaviour for signed arithmetic but well
 * defined for uint32_t.
 *
 * NOTE(review): lower-casing is done one code point at a time; for
 * characters whose lowercase form is longer than one code point this may
 * differ from hashing `s.lower()` - confirm against the pure-Python path.
 */
int32_t str_hashcode_(PyObject *str, int lower) {
    uint32_t res = 0;

    Py_ssize_t sz = PyUnicode_GET_LENGTH(str);
    if (!sz) {
        return 0;
    }

    int kind = PyUnicode_KIND(str);
    void *buf = PyUnicode_DATA(str);

    Py_ssize_t i;
    for (i = 0; i < sz; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, buf, i);

        if (lower) {
            ch = Py_UNICODE_TOLOWER(ch);
        }

        res = 31u * res + (uint32_t)ch;
    }

    /* Reinterpret the wrapped value as a signed 32-bit integer. */
    return (int32_t)res;
}
113+
114+
/*
 * Compute the 32-bit polynomial hash  h = 31 * h + b  over the bytes of
 * `data`, starting from 1 and treating every byte as a signed 8-bit value.
 *
 * data - exact bytes, exact bytearray, or a memoryview.
 *
 * Returns a new Python int holding the hash as a signed 32-bit value, or
 * NULL with an exception set:
 *   - ValueError  for any other argument type,
 *   - MemoryError if a temporary copy cannot be allocated,
 *   - whatever PyBuffer_ToContiguous() raises if the copy fails.
 *
 * A memoryview is not necessarily a flat block of memory (sliced with a
 * step, reversed, ...).  Reading `len` bytes straight from `buf` in that
 * case yields wrong data or runs outside the exporter's memory, so such
 * views are first copied into a contiguous temporary in logical (C) order.
 *
 * The accumulator is unsigned so that wrap-around modulo 2^32 is well
 * defined; signed overflow would be undefined behaviour.
 */
PyObject *b_hashcode(PyObject *data) {
    uint32_t res = 1;
    Py_ssize_t sz;
    const char *buf;
    char *copy = NULL;  /* owned temporary, only used for non-contiguous views */

    if (PyBytes_CheckExact(data)) {
        sz = PyBytes_GET_SIZE(data);
        buf = PyBytes_AS_STRING(data);
    }
    else if (PyByteArray_CheckExact(data)) {
        sz = PyByteArray_GET_SIZE(data);
        buf = PyByteArray_AS_STRING(data);
    }
    else if (PyMemoryView_Check(data)) {
        Py_buffer *view = PyMemoryView_GET_BUFFER(data);
        sz = view->len;

        if (PyBuffer_IsContiguous(view, 'C')) {
            buf = (const char *)view->buf;
        }
        else {
            copy = PyMem_Malloc((size_t)sz);
            if (copy == NULL) {
                return PyErr_NoMemory();
            }
            if (PyBuffer_ToContiguous(copy, view, sz, 'C') < 0) {
                PyMem_Free(copy);
                return NULL;
            }
            buf = copy;
        }
    }
    else {
        PyErr_SetString(PyExc_ValueError, hashcode_input_err);
        return NULL;
    }

    Py_ssize_t i;
    for (i = 0; i < sz; i++) {
        /* Sign-extend the byte first, then let it wrap into 32 bits. */
        res = 31u * res + (uint32_t)(int32_t)(signed char)buf[i];
    }

    PyMem_Free(copy);  /* no-op when no temporary was needed */

    return PyLong_FromLong((int32_t)res);
}
143+
144+
/*
 * schema_id(schema) -> int
 *
 * schema may be:
 *   - an exact int : returned as an int (already a schema id),
 *   - None         : 0,
 *   - a dict       : 0 when empty; otherwise an FNV-1 style hash over the
 *                    keys in dict iteration order.  Each key must be an
 *                    exact str; its lower-cased 32-bit hash (see
 *                    str_hashcode_) is fed into the hash one byte at a
 *                    time, least significant byte first.  Values are ignored.
 *
 * Returns a new reference holding the id as a signed 32-bit value, or NULL
 * with ValueError set when the argument (or one of the keys) has an
 * unsupported type.
 *
 * All mixing is done in uint32_t: the algorithm depends on multiplication
 * wrapping modulo 2^32, which is undefined behaviour for signed integers.
 */
PyObject *schema_id(PyObject *self, PyObject *args) {
    PyObject *data;

    if (!PyArg_ParseTuple(args, "O", &data)) {
        return NULL;
    }

    if (PyLong_CheckExact(data)) {
        return PyNumber_Long(data);
    }

    if (data == Py_None) {
        return PyLong_FromLong(0);
    }

    if (!PyDict_Check(data)) {
        PyErr_SetString(PyExc_ValueError, schema_id_input_err);
        return NULL;
    }

    if (PyDict_Size(data) == 0) {
        return PyLong_FromLong(0);
    }

    const uint32_t prime = (uint32_t)FNV1_PRIME;
    uint32_t s_id = (uint32_t)FNV1_OFFSET_BASIS;

    PyObject *key, *value;
    Py_ssize_t pos = 0;

    while (PyDict_Next(data, &pos, &key, &value)) {
        if (!PyUnicode_CheckExact(key)) {
            PyErr_SetString(PyExc_ValueError, schema_field_type_err);
            return NULL;
        }

        const uint32_t field_id = (uint32_t)str_hashcode_(key, 1);

        /* Mix in the four bytes of the field id, low byte first. */
        int shift;
        for (shift = 0; shift < 32; shift += 8) {
            s_id ^= (field_id >> shift) & 0xffu;
            s_id *= prime;
        }
    }

    /* Reinterpret the wrapped value as a signed 32-bit integer. */
    return PyLong_FromLong((int32_t)s_id);
}

pygridgain/api/binary.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pygridgain.datatypes import String, Int, Bool
2323
from pygridgain.queries import Query
2424
from pygridgain.queries.op_codes import *
25-
from pygridgain.utils import int_overflow, entity_id
25+
from pygridgain.utils import entity_id, schema_id
2626
from .result import APIResult
2727
from ..stream import BinaryStream, READ_BACKWARD
2828
from ..queries.response import Response
@@ -137,7 +137,7 @@ def put_binary_type(
137137
'is_enum': is_enum,
138138
'schema': [],
139139
}
140-
schema_id = None
140+
s_id = None
141141
if is_enum:
142142
data['enums'] = []
143143
for literal, ordinal in schema.items():
@@ -147,7 +147,7 @@ def put_binary_type(
147147
})
148148
else:
149149
# assemble schema and calculate schema ID in one go
150-
schema_id = FNV1_OFFSET_BASIS if schema else 0
150+
s_id = schema_id(schema)
151151
for field_name, data_type in schema.items():
152152
# TODO: check for allowed data types
153153
field_id = entity_id(field_name)
@@ -159,17 +159,9 @@ def put_binary_type(
159159
),
160160
'field_id': field_id,
161161
})
162-
schema_id ^= (field_id & 0xff)
163-
schema_id = int_overflow(schema_id * FNV1_PRIME)
164-
schema_id ^= ((field_id >> 8) & 0xff)
165-
schema_id = int_overflow(schema_id * FNV1_PRIME)
166-
schema_id ^= ((field_id >> 16) & 0xff)
167-
schema_id = int_overflow(schema_id * FNV1_PRIME)
168-
schema_id ^= ((field_id >> 24) & 0xff)
169-
schema_id = int_overflow(schema_id * FNV1_PRIME)
170162

171163
data['schema'].append({
172-
'schema_id': schema_id,
164+
'schema_id': s_id,
173165
'schema_fields': [
174166
{'schema_field_id': entity_id(x)} for x in schema
175167
],

pygridgain/utils.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@
2323
from pygridgain.datatypes.base import GridGainDataType
2424
from .constants import *
2525

26+
FALLBACK = False
27+
28+
try:
29+
from pygridgain import _cutils
30+
except ImportError:
31+
FALLBACK = True
32+
2633

2734
LONG_MASK = 0xffffffff
2835
DIGITS_PER_INT = 9
@@ -91,6 +98,13 @@ def hashcode(data: Union[str, bytes, bytearray, memoryview]) -> int:
9198
:param data: UTF-8-encoded string identifier of binary buffer or byte array
9299
:return: hash code.
93100
"""
101+
if FALLBACK:
102+
return __hashcode_fallback(data)
103+
104+
return _cutils.hashcode(data)
105+
106+
107+
def __hashcode_fallback(data: Union[str, bytes, bytearray, memoryview]) -> int:
94108
if data is None:
95109
return 0
96110

@@ -150,13 +164,21 @@ def schema_id(schema: Union[int, dict]) -> int:
150164
:param schema: a dict of field names: field types,
151165
:return: schema ID.
152166
"""
153-
if type(schema) is int:
167+
if FALLBACK:
168+
return __schema_id_fallback(schema)
169+
return _cutils.schema_id(schema)
170+
171+
172+
def __schema_id_fallback(schema: Union[int, dict]) -> int:
173+
if isinstance(schema, int):
154174
return schema
175+
155176
if schema is None:
156177
return 0
178+
157179
s_id = FNV1_OFFSET_BASIS if schema else 0
158180
for field_name in schema.keys():
159-
field_id = entity_id(field_name)
181+
field_id = __hashcode_fallback(field_name.lower())
160182
s_id ^= (field_id & 0xff)
161183
s_id = int_overflow(s_id * FNV1_PRIME)
162184
s_id ^= ((field_id >> 8) & 0xff)

requirements/install.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# these pip packages are necessary for the pygridgain to run
22

3-
typing==3.6.6; python_version<'3.5'
43
attrs==18.1.0
54
tzlocal==2.1

scripts/build_wheels.sh

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/bin/bash
2+
#
3+
# Copyright 2021 GridGain Systems, Inc. and Contributors.
4+
#
5+
# Licensed under the GridGain Community Edition License (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# https://www.gridgain.com/products/software/community-edition/gridgain-community-edition-license
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
set -e -u -x
19+
20+
# Repair a single wheel with auditwheel.
#
# $1 - path to the wheel file.
#
# Wheels that `auditwheel show` rejects are reported and left untouched;
# all others are repaired for platform tag $PLAT, with the result written
# to /wheels.  $PLAT must be set in the environment (the script runs under
# `set -u`).
function repair_wheel {
    # `local` keeps the variable from leaking into the script's global scope.
    local wheel="$1"

    if auditwheel show "$wheel"; then
        auditwheel repair "$wheel" --plat "$PLAT" -w /wheels
    else
        echo "Skipping non-platform wheel $wheel"
    fi
}
28+
29+
# Build a wheel with every interpreter under /opt/python whose path contains
# a cp36, cp37, cp38 or cp39 tag; other interpreters are skipped.
for PYBIN in /opt/python/*/bin; do
    case "$PYBIN" in
        *cp3[6789]*)
            "${PYBIN}/pip" wheel /pygridgain/ --no-deps -w /wheels
            ;;
    esac
done
35+
36+
# Run every built wheel through repair_wheel, which bundles external shared
# libraries into it.
for whl in /wheels/*.whl; do
    repair_wheel "$whl"
done

# Keep only the wheels whose name contains "manylinux" (made readable and
# writable for everyone); delete the rest.
for whl in /wheels/*.whl; do
    case "$whl" in
        *manylinux*)
            chmod 666 "$whl"
            ;;
        *)
            rm "$whl"
            ;;
    esac
done
48+
49+
rm -rf /pygridgain/*.egg-info
50+
rm -rf /pygridgain/.eggs

0 commit comments

Comments
 (0)