Implements sha256 exception functionality
[releng-anteater.git] / anteater / src / project_scan.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ##############################################################################
4 # Copyright (c) 2017 Luke Hinds <lhinds@redhat.com>, Red Hat
5 #
6 # All rights reserved. This program and the accompanying materials
7 # are made available under the terms of the Apache License, Version 2.0
8 # which accompanies this distribution, and is available at
9 # http://www.apache.org/licenses/LICENSE-2.0
10 ##############################################################################
11
12 """
13     Accepts the --path argument and iterates the root directory using os.walk.
14     Each file is checked for a blacklisted name, blacklisted content, or for
15     being a non-whitelisted binary. Any violations found are added to a log file.
16 """
17
18 from __future__ import division, print_function, absolute_import
19 import ConfigParser
20 import hashlib
21 import os
22 import re
23 import anteater.utils.anteater_logger as antlog
24 import anteater.src.get_lists as get_lists
25 from binaryornot.check import is_binary
26
# Module-level logger shared by every function in this file.
logger = antlog.Logger(__name__).getLogger()
# Settings come from 'anteater.conf'; the relative name is resolved against
# the current working directory when this module is imported.
config = ConfigParser.RawConfigParser()
config.read('anteater.conf')
# Prefix for report files: paths are built by plain string concatenation
# (reports_dir + "<report name>" + project + ".log").
reports_dir = config.get('config', 'reports_dir')
# NOTE(review): not referenced by any function visible in this file --
# confirm whether other modules rely on it.
master_list = config.get('config', 'master_list')
# Directory names pruned from every os.walk traversal in this module.
ignore_dirs = ['.git']
# Single module-wide SHA-256 object. A hashlib object accumulates all data
# passed to update(), so one shared instance cannot produce independent
# digests for several files.
hasher = hashlib.sha256()
34
35
def prepare_project(project, project_dir):
    """Collect the project's lists and waivers, then run all scans.

    Every list is obtained from a single ``get_lists.GetLists`` instance.
    The file scan runs first, followed by the per-file licence header check
    and the check for a LICENSE file in the project root.

    Args:
        project: Project name; selects the project waivers and is embedded
            in the report file names by the scan functions.
        project_dir: Root directory of the project tree to scan.
    """
    list_source = get_lists.GetLists()

    # Whitelist of permitted binaries.
    binary_whitelist = list_source.binary_list(project)

    # File name blacklist together with the project's waivers.
    name_blacklist, name_waivers = list_source.file_audit_list(project)

    # File content blacklist together with the project's waivers.
    content_blacklist, content_waivers = list_source.file_content_list(
        project)

    # Extensions subject to licence checks, and file names exempt from them.
    licence_extensions = list_source.licence_extensions()
    licence_exempt = list_source.licence_ignore()

    # Rudimentary scans: file names, file contents and binaries.
    scan_file(
        project_dir,
        project,
        binary_whitelist,
        name_blacklist,
        name_waivers,
        content_blacklist,
        content_waivers,
    )

    # Licence checks: per-file headers, then the root LICENSE file.
    licence_check(licence_extensions, licence_exempt, project, project_dir)
    licence_root_check(project_dir, project)
62     licence_root_check(project_dir, project)
63
64
def scan_file(project_dir, project, binary_list, file_audit_list,
              file_audit_project_list, file_content_list,
              project_content_list):
    """Walk project_dir and report banned file names, content and binaries.

    Each file is first checked against the file name blacklist. Text files
    are then scanned line by line for blacklisted content, while binary
    files are checked against the binary whitelist and the list of accepted
    SHA-256 hashes. Violations are logged and appended to per-project report
    files under ``reports_dir``.

    Args:
        project_dir: Root directory of the tree to scan.
        project: Project name, embedded in the report file names.
        binary_list: Compiled regex of whitelisted binary paths.
        file_audit_list: Compiled regex of blacklisted file names.
        file_audit_project_list: Compiled regex of project waivers for the
            file name blacklist.
        file_content_list: Compiled regex of blacklisted content.
        project_content_list: Compiled regex of project waivers for the
            content blacklist.
    """
    for root, dirs, files in os.walk(project_dir):
        # Prune in place so os.walk does not descend into ignored dirs.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for name in files:
            full_path = os.path.join(root, name)
            _audit_file_name(full_path, project, file_audit_list,
                             file_audit_project_list)
            if is_binary(full_path):
                _audit_binary(full_path, project, binary_list)
            else:
                _audit_file_content(full_path, project, file_content_list,
                                    project_content_list)


def _audit_file_name(full_path, project, file_audit_list,
                     file_audit_project_list):
    """Report full_path when it hits the name blacklist and has no waiver."""
    match = file_audit_list.search(full_path)
    if not match or file_audit_project_list.search(full_path):
        return
    logger.error('Blacklisted filename: {0}'.format(full_path))
    logger.error('Matched String: {0}'.format(match.group()))
    with open(reports_dir + "file-names_" + project + ".log",
              "a") as gate_report:
        gate_report.write('Blacklisted filename: {0}\n'.format(full_path))
        # Terminate the entry so consecutive reports do not share a line.
        gate_report.write('Matched String: {0}\n'.format(match.group()))


def _audit_file_content(full_path, project, file_content_list,
                        project_content_list):
    """Report every line of a text file that holds un-waived banned content."""
    # Read all lines up front and close the handle straight away: the report
    # file is appended to while looping and could be the file being scanned
    # if reports_dir lies inside the project tree.
    with open(full_path, 'r') as source:
        lines = source.readlines()
    for line in lines:
        match = file_content_list.search(line)
        if not match or project_content_list.search(line):
            continue
        logger.error('File contains violation: {0}'.format(full_path))
        logger.error('Flagged Content: {0}'.format(line.rstrip()))
        logger.error('Matched String: {0}'.format(match.group()))
        with open(reports_dir + "contents-" + project + ".log",
                  "a") as gate_report:
            gate_report.write('File contains violation: {0}\n'.
                              format(full_path))
            # Normalise the line ending so a final line without a newline
            # cannot merge with the next report entry.
            gate_report.write('Flagged Content: {0}\n'.
                              format(line.rstrip('\r\n')))
            gate_report.write('Matched String: {0}\n'.
                              format(match.group()))


def _audit_binary(full_path, project, binary_list):
    """Report a binary that is neither whitelisted by path nor by hash."""
    hashlist = get_lists.GetLists()
    binary_hash = hashlist.binary_hash(project, full_path)
    if binary_list.search(full_path):
        return
    digest = _sha256_of_file(full_path)
    if digest in binary_hash:
        logger.info('Found matching file hash for file: {0}'.
                    format(full_path))
        return
    logger.error('Non Whitelisted Binary file: {0}'.format(full_path))
    logger.error('Please submit patch with this hash: {0}'.format(digest))
    with open(reports_dir + "binaries-" + project + ".log",
              "a") as gate_report:
        gate_report.write('Non Whitelisted Binary: {0}\n'.format(full_path))


def _sha256_of_file(path, chunk_size=65536):
    """Return the SHA-256 hex digest of the file at path."""
    # A fresh hash object per file: a shared one keeps accumulating data
    # across update() calls and would hash the concatenation of all files.
    file_hasher = hashlib.sha256()
    with open(path, 'rb') as afile:
        # Read in chunks so large binaries are not loaded into memory whole.
        for chunk in iter(lambda: afile.read(chunk_size), b''):
            file_hasher.update(chunk)
    return file_hasher.hexdigest()
138
139
def licence_root_check(project_dir, project):
    """Check that a LICENSE file exists directly under project_dir.

    Its presence is logged at info level. Its absence is logged as an error
    and appended to the project's licence report under ``reports_dir``.

    Args:
        project_dir: Root directory of the project.
        project: Project name, embedded in the report file name.
    """
    licence_path = project_dir + '/LICENSE'
    if os.path.isfile(licence_path):
        logger.info('LICENSE file present in: {0}'.format(project_dir))
        return

    logger.error('LICENSE file missing in: {0}'.format(project_dir))
    report_path = reports_dir + "licence-" + project + ".log"
    with open(report_path, "a") as gate_report:
        message = 'LICENSE file missing in: {0}\n'.format(project_dir)
        gate_report.write(message)
152
153
def licence_check(licence_ext, licence_ignore, project, project_dir):
    """Perform basic checks for the presence of licence strings.

    Walks project_dir and, for every non-binary file whose name ends with
    one of licence_ext and is not listed in licence_ignore, searches the
    content case-insensitively for 'copyright' or 'spdx'. Files containing
    neither are logged as errors and appended to the project's licence
    report under ``reports_dir``.

    Args:
        licence_ext: Iterable of file name suffixes that require a licence
            header.
        licence_ignore: Container of file names (not paths) exempt from the
            check.
        project: Project name, embedded in the report file name.
        project_dir: Root directory of the tree to check.
    """
    # Build the suffix tuple once rather than for every file.
    extensions = tuple(licence_ext)
    # Note: Hardcoded use of 'copyright' & 'spdx' is the result
    # of a decision made at 2017 plugfest to limit searches to
    # just these two strings.
    licence_marker = re.compile("copyright|spdx", re.IGNORECASE)

    for root, dirs, files in os.walk(project_dir):
        # Prune in place so os.walk does not descend into ignored dirs.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for file_name in files:
            if not file_name.endswith(extensions) \
                    or file_name in licence_ignore:
                continue
            full_path = os.path.join(root, file_name)
            if is_binary(full_path):
                continue
            # Context manager so the handle is closed for every file.
            with open(full_path, 'r') as source:
                content = source.read()
            if licence_marker.search(content):
                logger.info('Licence string present: {0}'.
                            format(full_path))
                continue
            logger.error('Licence header missing: {0}'.format(full_path))
            with open(reports_dir + "licence-" + project + ".log",
                      "a") as gate_report:
                gate_report.write('Licence header missing: {0}\n'.
                                  format(full_path))
180                             gate_report.write('Licence header missing: {0}\n'.
181                                               format(full_path))