Skip to content

Commit b7f2937

Browse files
committed
Changed processing to produce fewer features
1 parent 1d8b9a8 commit b7f2937

File tree

3 files changed

+33
-17
lines changed

3 files changed

+33
-17
lines changed

SecurityKeywordsBasedSearchTool/SecFeatFinder/FeatureModel.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,9 @@ def add_to_fm(fm, taxonomy, feature_name, tag):
2424
break
2525
if not exists:
2626
fm = Feature(feature.name, fm)
27+
for feature in fm.sub_features:
28+
if feature.name == tag:
29+
return feature
2730
return Feature(tag, fm)
2831

2932

SecurityKeywordsBasedSearchTool/SecFeatFinder/SecList.json

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
{
22
"Access_Control": {
33
"Authentication": [
4-
"Authentication",
5-
"Password",
4+
"[Aa]uthenticat*",
5+
"[Pp]assword",
66
"Credential",
77
"OTP",
88
"X509",
@@ -16,7 +16,7 @@
1616
"IdP"
1717
],
1818
"Authorization": [
19-
"authoriz*",
19+
"[Aa]uthoriz*",
2020
"AccessManager",
2121
"Role",
2222
"Privilege",
@@ -68,8 +68,7 @@
6868
],
6969
"Steganography": [
7070
"ImageProcessor",
71-
"Stego",
72-
"pad(",
71+
"[Ss]tego",
7372
"[Bb]ase64"
7473
]
7574
},
@@ -93,11 +92,11 @@
9392
},
9493
"Secure_Data_Handling": {
9594
"Data_Validation": [
96-
"validate(",
95+
"validate",
9796
"Whitelist",
9897
"Blacklist",
9998
"AntiSamy",
100-
"Commons Validator"
99+
"CommonsValidator"
101100
],
102101
"Data_Sanitization": [
103102
"sanitize",

SecurityKeywordsBasedSearchTool/SecFeatFinder/main.py

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,6 @@ def process_feature_annotations(features_file, repo_dir, flattened_keywords, tax
3535

3636
library_features = set()
3737

38-
api_id = 1 # Initialize API ID for tagging
3938
for source in data.get('sources', []):
4039
for feature in source.get('files', []):
4140
file_path = os.path.join(repo_dir, feature.get('path', ''))
@@ -55,8 +54,7 @@ def process_feature_annotations(features_file, repo_dir, flattened_keywords, tax
5554

5655
if feature_names and line_index < len(lines):
5756
for feature_name in feature_names:
58-
tag = f"API_{api_id}_{feature_name}_{method_name}"
59-
api_id += 1
57+
tag = f"APIMatch|{feature_name}|{method_name}"
6058
line_annotations[line_index].add(tag)
6159
library_features.add(tag)
6260
if add_to_fm(fm, taxonomy, feature_name, tag) is None:
@@ -174,7 +172,7 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
174172
# Search only non-comment, non-test, non-HAnS-annotated lines
175173
keywords_found = {}
176174
for category, subcategory, keyword in flattened_keywords:
177-
if re.search(rf"\b{re.escape(keyword)}\b", cleaned_line):
175+
if re.search(rf"\b{keyword}\b", cleaned_line, re.IGNORECASE):
178176
key = f"{category} : {subcategory}"
179177
if key not in keywords_found:
180178
keywords_found[key] = []
@@ -226,14 +224,23 @@ def search_keywords_in_file(file_path, flattened_keywords, repo_dir,
226224

227225
def determine_feature(pos_counter, matches, line_number, fm):
228226
features = ''
229-
for match in list(matches[line_number]["Keywords Found"].keys()):
227+
for match in list(matches[line_number]["Keywords Found"].items()):
230228
if len(features) > 0:
231229
features += ', '
232-
path = match.split(' : ')
230+
path = match[0].split(' : ')
233231
length = len(path)
234-
feature = 'KeywordMatch_' + str(pos_counter[0]) + '_' + path[length - 1]
235-
pos_counter[0] += 1
236-
features += feature
232+
value = (
233+
match[1][0]
234+
.replace('[', '')
235+
.replace(']', '')
236+
.replace('(', '')
237+
.replace(')', '')
238+
.replace('*', '')
239+
.replace('?', '')
240+
)
241+
242+
feature_name = f'KeywordMatch|{path[length - 1]}|{value}'
243+
features += feature_name
237244

238245
current = fm
239246
i = 0
@@ -248,7 +255,14 @@ def determine_feature(pos_counter, matches, line_number, fm):
248255
if not found:
249256
current = Feature(name, current)
250257
i += 1
251-
Feature(feature, current)
258+
259+
exists = False
260+
for f in current.sub_features:
261+
if f.name == feature_name:
262+
exists = True
263+
break
264+
if not exists:
265+
Feature(feature_name, current)
252266
return features, fm
253267

254268

0 commit comments

Comments
 (0)