f33ccca67d6695510e1e4cb049da93db2d82d6f0
[releng-anteater.git] / anteater / src / project_scan.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ##############################################################################
4 # Copyright (c) 2017 Luke Hinds <lhinds@redhat.com>, Red Hat
5 #
6 # All rights reserved. This program and the accompanying materials
7 # are made available under the terms of the Apache License, Version 2.0
8 # which accompanies this distribution, and is available at
9 # http://www.apache.org/licenses/LICENSE-2.0
10 ##############################################################################
11
12 """
13     Accepts the --path argument and iterates the root directory using os.walk.
14     Each file is checked for being a binary or containing a blacklisted string.
15     If any violations are found, the script adds the violation to a log file.
16 """
17
18 from __future__ import division, print_function, absolute_import
19 import hashlib
20 import six.moves.configparser
21 import os
22 import re
23 import logging
24 from binaryornot.check import is_binary
25
26 from . import get_lists
27
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Settings are loaded once, at import time, from 'anteater.conf'. The path is
# relative, so it is resolved against the current working directory.
config = six.moves.configparser.RawConfigParser()
# NOTE: read() silently skips files it cannot open; in that case the get()
# calls below raise because the 'config' section does not exist.
config.read('anteater.conf')
# Prefix for every report path. The functions in this module concatenate it
# directly with the report file name, so it presumably has to end with a path
# separator -- TODO confirm against the shipped anteater.conf.
reports_dir = config.get('config', 'reports_dir')
# Read from the config, but not referenced by the functions in this module.
master_list = config.get('config', 'master_list')
# Directory names pruned from every os.walk() traversal in this module.
ignore_dirs = ['.git']
# Shared SHA-256 hash object.
# NOTE(review): a hash object accumulates all data passed to update(), so
# reusing this single instance across files yields a digest of everything
# hashed so far rather than the digest of one file.
hasher = hashlib.sha256()
35
36
def prepare_project(project, project_dir):
    """Collect the black/white lists for *project* and run every scan.

    The lists and project waivers are fetched from ``get_lists.GetLists``
    and handed to ``scan_file``; afterwards the licence header check and the
    root LICENSE file check are run against *project_dir*.
    """
    list_source = get_lists.GetLists()

    # White list for binaries.
    binary_whitelist = list_source.binary_list(project)

    # File name black list together with the project's waivers.
    name_blacklist, name_waivers = list_source.file_audit_list(project)

    # File content black list together with the project's waivers.
    content_blacklist, content_waivers = \
        list_source.file_content_list(project)

    # Licence related lists.
    extensions = list_source.licence_extensions()
    ignored = list_source.licence_ignore()

    # Rudimentary scans: file names, file contents and binaries.
    scan_file(project_dir, project, binary_whitelist, name_blacklist,
              name_waivers, content_blacklist, content_waivers)

    # Licence header checks, then the root LICENSE file check.
    licence_check(extensions, ignored, project, project_dir)
    licence_root_check(project_dir, project)
64
65
def _append_report(report_prefix, project, entries):
    """Append the strings in *entries* to one of the project's gate reports.

    The report path is ``reports_dir + report_prefix + project + ".log"``.
    """
    with open(reports_dir + report_prefix + project + ".log",
              "a") as gate_report:
        gate_report.writelines(entries)


def scan_file(project_dir, project, binary_list, file_audit_list,
              file_audit_project_list, file_content_list,
              project_content_list):
    """Search a project tree for banned file names, content and binaries.

    Walks *project_dir* (skipping directories named in ``ignore_dirs``) and,
    for every file:

    * logs and reports the path when it matches the file name blacklist and
      is not waived by the project;
    * for text files, logs and reports every line that matches the content
      blacklist and is not waived by the project;
    * for binary files not matched by the binary whitelist, compares the
      file's SHA-256 digest with the project's known hashes and logs and
      reports the file when the digest is unknown.

    Args:
        project_dir: Root directory to walk.
        project: Project name; used in the report file names and for the
            binary hash lookup.
        binary_list: Object with ``search()`` (e.g. a compiled regex)
            matching whitelisted binary paths.
        file_audit_list: Object with ``search()`` matching blacklisted
            file names.
        file_audit_project_list: Object with ``search()`` matching the
            project's file name waivers.
        file_content_list: Object with ``search()`` matching blacklisted
            content.
        project_content_list: Object with ``search()`` matching the
            project's content waivers.
    """
    for root, dirs, files in os.walk(project_dir):
        # Prune ignored directories in place so os.walk does not descend.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for items in files:
            full_path = os.path.join(root, items)

            # Check for Blacklisted file names
            match = file_audit_list.search(full_path)
            if match and not file_audit_project_list.search(full_path):
                logger.error('Blacklisted filename: %s', full_path)
                logger.error('Matched String: %s', match.group())
                # Every entry ends with a newline so that consecutive
                # violations do not run together in the report.
                _append_report("file-names_", project, [
                    'Blacklisted filename: {0}\n'.format(full_path),
                    'Matched String: {0}\n'.format(match.group()),
                ])

            if not is_binary(full_path):
                # The context manager closes the handle once the file has
                # been scanned; lines are streamed rather than all loaded.
                with open(full_path, 'r') as fo:
                    for line in fo:
                        # Check for sensitive content in project files
                        match = file_content_list.search(line)
                        if not match or project_content_list.search(line):
                            continue
                        logger.error('File contains violation: %s',
                                     full_path)
                        logger.error('Flagged Content: %s', line.rstrip())
                        logger.error('Matched String: %s', match.group())
                        # The final line of a file may lack a newline, so
                        # normalise the line ending before writing it.
                        _append_report("contents-", project, [
                            'File contains violation: {0}\n'.format(
                                full_path),
                            'Flagged Content: {0}\n'.format(
                                line.rstrip('\n')),
                            'Matched String: {0}\n'.format(match.group()),
                        ])
                continue

            # Check if Binary is whitelisted
            hashlist = get_lists.GetLists()
            binary_hash = hashlist.binary_hash(project, full_path)
            if binary_list.search(full_path):
                continue

            # A fresh hash object per file: a shared one would accumulate
            # the bytes of every previously hashed binary and produce a
            # digest that matches no single file.
            file_hasher = hashlib.sha256()
            with open(full_path, 'rb') as afile:
                for chunk in iter(lambda: afile.read(65536), b''):
                    file_hasher.update(chunk)
            digest = file_hasher.hexdigest()

            if digest in binary_hash:
                logger.info('Found matching file hash for file: %s',
                            full_path)
            else:
                logger.error('Non Whitelisted Binary file: %s', full_path)
                logger.error('Please submit patch with this hash: %s',
                             digest)
                _append_report("binaries-", project, [
                    'Non Whitelisted Binary: {0}\n'.format(full_path),
                ])
134
135
def licence_root_check(project_dir, project):
    """Verify that a LICENSE file sits directly inside *project_dir*.

    Its presence is logged at info level. When it is missing, an error is
    logged and a line is appended to the project's licence report.
    """
    licence_path = project_dir + '/LICENSE'
    if os.path.isfile(licence_path):
        logger.info('LICENSE file present in: %s', project_dir)
        return

    logger.error('LICENSE file missing in: %s', project_dir)
    report_path = reports_dir + "licence-" + project + ".log"
    with open(report_path, "a") as gate_report:
        message = 'LICENSE file missing in: {0}\n'.format(project_dir)
        gate_report.write(message)
146
147
def licence_check(licence_ext, licence_ignore, project, project_dir):
    """Perform basic checks for the presence of licence strings.

    Walks *project_dir* (skipping directories named in ``ignore_dirs``) and
    inspects every non-binary file whose name ends with one of
    *licence_ext* and is not listed in *licence_ignore*. A file passes when
    its content contains 'copyright' or 'spdx' (case-insensitive); otherwise
    an error is logged and a line is appended to the project's licence
    report.

    Args:
        licence_ext: Iterable of file name suffixes to check.
        licence_ignore: Container of file names exempt from the check.
        project: Project name; used in the report file name.
        project_dir: Root directory to walk.
    """
    # str.endswith needs a tuple; build it once rather than once per file.
    extensions = tuple(licence_ext)
    for root, dirs, files in os.walk(project_dir):
        # Prune ignored directories in place so os.walk does not descend.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for filename in files:
            if not filename.endswith(extensions) \
                    or filename in licence_ignore:
                continue
            full_path = os.path.join(root, filename)
            if is_binary(full_path):
                continue
            # The context manager closes the handle as soon as the content
            # has been read.
            with open(full_path, 'r') as fo:
                content = fo.read()
            # Note: Hardcoded use of 'copyright' & 'spdx' is the result
            # of a decision made at 2017 plugfest to limit searches to
            # just these two strings.
            if re.search("copyright|spdx", content, re.IGNORECASE):
                logger.info('Licence string present: %s', full_path)
            else:
                logger.error('Licence header missing: %s', full_path)
                with open(reports_dir + "licence-" + project + ".log",
                          "a") as gate_report:
                    gate_report.write('Licence header missing: {0}\n'.
                                      format(full_path))