Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,6 @@ is provided at the docstring of each class.
See also [here](#approximate-duplicate-code-detection) for a command line tool.
* [`treesitter.parser_for`](python/dpu_utils/codeutils/treesitter/parser.py) get [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) parser by language name.

##### TensorFlow 1.x Utilities `dpu_utils.tfutils`
* [`get_activation`](python/dpu_utils/tfutils/activation.py) retrieve activations function by name.
* [`GradRatioLoggingOptimizer`](python/dpu_utils/tfutils/gradratiologgingoptimizer.py) a wrapper around optimizers that logs the ratios of grad norms to parameter norms.
* [`TFVariableSaver`](python/dpu_utils/tfutils/tfvariablesaver.py) save TF variables in an object that can be pickled.

Unsorted segment operations following TensorFlow's [`unsorted_segment_sum`](https://www.tensorflow.org/api_docs/python/tf/math/unsorted_segment_sum) operations:
* [`unsorted_segment_logsumexp`](python/dpu_utils/tfutils/unsortedsegmentops.py)
* [`unsorted_segment_log_softmax`](python/dpu_utils/tfutils/unsortedsegmentops.py)
* [`unsorted_segment_softmax`](python/dpu_utils/tfutils/unsortedsegmentops.py)

##### TensorFlow 2.x Utilities `dpu_utils.tf2utils`
* [`get_activation_function_by_name`](python/dpu_utils/tf2utils/activation.py) retrieve activation functions by name.
* [`gelu`](python/dpu_utils/tf2utils/activation.py) The GeLU activation function.
Expand All @@ -69,18 +59,6 @@ Unsorted segment operations following TensorFlow's [`unsorted_segment_sum`](http
* [`unsorted_segment_softmax`](python/dpu_utils/tf2utils/unsorted_segment_ops.py)


##### TensorFlow Models `dpu_utils.tfmodels`
* [`SparseGGNN`](python/dpu_utils/tfmodels/sparsegnn.py) a sparse GGNN implementation.
* [`AsyncGGNN`](python/dpu_utils/tfmodels/asyncgnn.py) an asynchronous GGNN implementation.

These models have not been tested with TF 2.0.

##### PyTorch Utilities `dpu_utils.ptutils`
* [`BaseComponent`](python/dpu_utils/ptutils/basecomponent.py) a wrapper abstract class around `nn.Module` that
takes care of essential elements of most neural network components.
* [`ComponentTrainer`](python/dpu_utils/ptutils/basecomponent.py) a training loop for `BaseComponent`s.


### Command-line tools

#### Approximate Duplicate Code Detection
Expand Down
1 change: 1 addition & 0 deletions __azurite_db_blob__.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"filename":"/mnt/c/Users/miallama/Documents/src/dpu-utils/__azurite_db_blob__.json","collections":[{"name":"$SERVICES_COLLECTION$","data":[],"idIndex":null,"binaryIndices":{},"constraints":null,"uniqueNames":["accountName"],"transforms":{},"objType":"$SERVICES_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]},{"name":"$CONTAINERS_COLLECTION$","data":[{"accountName":"devstoreaccount1","name":"test1","properties":{"etag":"\"0x202253268606A80\"","lastModified":"2021-02-09T09:42:06.576Z","leaseStatus":"unlocked","leaseState":"available","hasImmutabilityPolicy":false,"hasLegalHold":false},"meta":{"revision":0,"created":1612863726585,"version":0},"$loki":1}],"idIndex":null,"binaryIndices":{"accountName":{"name":"accountName","dirty":false,"values":[0]},"name":{"name":"name","dirty":false,"values":[0]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$CONTAINERS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":1,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]},{"name":"$BLOBS_COLLECTION$","data":[],"idIndex":[],"binaryIndices":{"accountName":{"name":"accountName","dirty":false,"values":[]},"containerName":{"name":"containerName","dirty":false,"values":[]},"name":{"name":"name","dirty":false,"values":[]},"snapshot":{"name":"snapshot","dirty":false,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$BLOBS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":57,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]},{"name":"$BLOCKS_COLLECTION$","data":[],"idIndex":null,"binaryIndices":{"accountName":{"name":"accountName","dirty":false,"values":[]},"containerName":{"name":"containerName","dirty":false,"values":[]},"blobName":{"name":"blobName","dirty":false,"values":[]},"name":{"name":"name","dirty":false,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$BLOCKS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}],"databaseVersion":1.5,"engineVersion":1.5,"autosave":true,"autosaveInterval":5000,"autosaveHandle":null,"throttledSaves":true,"options":{"autosave":true,"autosaveInterval":5000,"serializationMethod":"normal","destructureDelimiter":"$<\n"},"persistenceMethod":"fs","persistenceAdapter":null,"verbose":false,"events":{"init":[null],"loaded":[],"flushChanges":[],"close":[],"changes":[],"warning":[]},"ENV":"NODEJS"}
1 change: 1 addition & 0 deletions __azurite_db_blob_extent__.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"filename":"/mnt/c/Users/miallama/Documents/src/dpu-utils/__azurite_db_blob_extent__.json","collections":[{"name":"$EXTENTS_COLLECTION$","data":[{"id":"715987c7-5b32-48a5-8994-ec4200188d4a","locationId":"Default","path":"715987c7-5b32-48a5-8994-ec4200188d4a","size":33890,"lastModifiedInMS":1613484063683,"meta":{"revision":9,"created":1613484063684,"version":0,"updated":1613484064944},"$loki":2,"LastModifyInMS":1613484064944}],"idIndex":[2],"binaryIndices":{"id":{"name":"id","dirty":false,"values":[0]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$EXTENTS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":2,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}],"databaseVersion":1.5,"engineVersion":1.5,"autosave":true,"autosaveInterval":5000,"autosaveHandle":null,"throttledSaves":true,"options":{"autosave":true,"autosaveInterval":5000,"serializationMethod":"normal","destructureDelimiter":"$<\n"},"persistenceMethod":"fs","persistenceAdapter":null,"verbose":false,"events":{"init":[null],"loaded":[],"flushChanges":[],"close":[],"changes":[],"warning":[]},"ENV":"NODEJS"}
Binary file not shown.
1 change: 1 addition & 0 deletions python/__azurite_db_blob__.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"filename":"/mnt/c/Users/miallama/Documents/src/dpu-utils/python/__azurite_db_blob__.json","collections":[{"name":"$SERVICES_COLLECTION$","data":[],"idIndex":null,"binaryIndices":{},"constraints":null,"uniqueNames":["accountName"],"transforms":{},"objType":"$SERVICES_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]},{"name":"$CONTAINERS_COLLECTION$","data":[{"accountName":"devstoreaccount1","name":"test1","properties":{"etag":"\"0x20DFF9BFFA6CE20\"","lastModified":"2021-02-16T15:31:20.786Z","leaseStatus":"unlocked","leaseState":"available","hasImmutabilityPolicy":false,"hasLegalHold":false},"meta":{"revision":0,"created":1613489480796,"version":0},"$loki":1}],"idIndex":null,"binaryIndices":{"accountName":{"name":"accountName","dirty":false,"values":[0]},"name":{"name":"name","dirty":false,"values":[0]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$CONTAINERS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":1,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]},{"name":"$BLOBS_COLLECTION$","data":[],"idIndex":[],"binaryIndices":{"accountName":{"name":"accountName","dirty":false,"values":[]},"containerName":{"name":"containerName","dirty":false,"values":[]},"name":{"name":"name","dirty":false,"values":[]},"snapshot":{"name":"snapshot","dirty":false,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$BLOBS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":11,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]},{"name":"$BLOCKS_COLLECTION$","data":[],"idIndex":null,"binaryIndices":{"accountName":{"name":"accountName","dirty":false,"values":[]},"containerName":{"name":"containerName","dirty":false,"values":[]},"blobName":{"name":"blobName","dirty":false,"values":[]},"name":{"name":"name","dirty":false,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$BLOCKS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}],"databaseVersion":1.5,"engineVersion":1.5,"autosave":true,"autosaveInterval":5000,"autosaveHandle":null,"throttledSaves":true,"options":{"autosave":true,"autosaveInterval":5000,"serializationMethod":"normal","destructureDelimiter":"$<\n"},"persistenceMethod":"fs","persistenceAdapter":null,"verbose":false,"events":{"init":[null],"loaded":[],"flushChanges":[],"close":[],"changes":[],"warning":[]},"ENV":"NODEJS"}
1 change: 1 addition & 0 deletions python/__azurite_db_blob_extent__.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"filename":"/mnt/c/Users/miallama/Documents/src/dpu-utils/python/__azurite_db_blob_extent__.json","collections":[{"name":"$EXTENTS_COLLECTION$","data":[{"id":"3f3dd13c-ce2f-46c3-bf31-4ebd32f54594","locationId":"Default","path":"3f3dd13c-ce2f-46c3-bf31-4ebd32f54594","size":33890,"lastModifiedInMS":1613489480814,"meta":{"revision":9,"created":1613489480814,"version":0,"updated":1613489481821},"$loki":1,"LastModifyInMS":1613489481821}],"idIndex":[1],"binaryIndices":{"id":{"name":"id","dirty":false,"values":[0]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"$EXTENTS_COLLECTION$","dirty":false,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":true,"transactional":false,"cloneObjects":false,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":1,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}],"databaseVersion":1.5,"engineVersion":1.5,"autosave":true,"autosaveInterval":5000,"autosaveHandle":null,"throttledSaves":true,"options":{"autosave":true,"autosaveInterval":5000,"serializationMethod":"normal","destructureDelimiter":"$<\n"},"persistenceMethod":"fs","persistenceAdapter":null,"verbose":false,"events":{"init":[null],"loaded":[],"flushChanges":[],"close":[],"changes":[],"warning":[]},"ENV":"NODEJS"}
Binary file not shown.
17 changes: 11 additions & 6 deletions python/dpu_utils/mlutils/vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ def __str__(self):
def get_all_names(self) -> FrozenSet[str]:
return frozenset(self.token_to_id.keys())

def __batch_add_from_counter(self, token_counter: typing.Counter[str], count_threshold: int, max_size: int) -> None:
"""Update dictionary with elements of the token_counter"""
for token, count in token_counter.most_common(max_size):
def __batch_add_from_counter(self, token_counter: typing.Counter[str], count_threshold: int, max_num_elements_to_add: int) -> None:
"""Update dictionary with at most max_num_elements_to_add elements of the token_counter"""
for token, count in token_counter.most_common(max_num_elements_to_add):
if count >= count_threshold:
self.add_or_get_id(token)
else:
Expand All @@ -115,9 +115,14 @@ def create_vocabulary(tokens: Union[Iterable[str], typing.Counter[str]], max_siz
vocab.__batch_add_from_counter(token_counter, count_threshold, max_size - num_base_tokens)
return vocab

def update(self, token_counter: typing.Counter[str], max_size: int, count_threshold: int=5):
assert len(self) < max_size, 'Dictionary is already larger than max_size.'
self.__batch_add_from_counter(token_counter, count_threshold=count_threshold, max_size=max_size)
def update(self, token_counter: typing.Counter[str], max_num_elements_to_add: int, count_threshold: int=5) -> None:
"""
Update this Vocabulary instance with elements from the new token_counter. This will add at most
max_num_elements_to_add elements from token_counter with a count above the count_threshold.
Elements that already existed will maintain their id, but new ids may be added.
"""
self.__batch_add_from_counter(token_counter, count_threshold=count_threshold,
max_num_elements_to_add=max_num_elements_to_add)

def get_empirical_distribution(self, elements: Iterable[str], dirichlet_alpha: float = 10.) -> np.ndarray:
"""Retrieve empirical distribution of elements."""
Expand Down
2 changes: 0 additions & 2 deletions python/dpu_utils/ptutils/__init__.py

This file was deleted.

Loading