c7c6f286e5fa2f2dcec7b8ce873ec82fe55e8780
[releng-anteater.git] / anteater / src / project_scan.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ##############################################################################
4 # Copyright (c) 2017 Luke Hinds <lhinds@redhat.com>, Red Hat
5 #
6 # All rights reserved. This program and the accompanying materials
7 # are made available under the terms of the Apache License, Version 2.0
8 # which accompanies this distribution, and is available at
9 # http://www.apache.org/licenses/LICENSE-2.0
10 ##############################################################################
11
12 """
    Accepts the --path argument and iterates the root directory using os.walk.
    Each file is checked to see whether it is a binary or contains a
    blacklisted string. If any violations are found, the script adds the
    violation to a log file.
16 """
17
18 from __future__ import division, print_function, absolute_import
19 import ConfigParser
20 import os
21 import re
22 import anteater.utils.anteater_logger as antlog
23 import anteater.src.get_lists as get_lists
24 from binaryornot.check import is_binary
25
# Module-level logger obtained through the project's antlog wrapper.
logger = antlog.Logger(__name__).getLogger()
# Settings are loaded once, at import time. 'anteater.conf' is a relative
# path, so it is looked up in the process's current working directory.
config = ConfigParser.RawConfigParser()
config.read('anteater.conf')
# Used below as a plain string prefix for every report file name
# (reports_dir + "<kind>" + project + ".log").
# NOTE(review): presumably expected to end with a path separator - confirm.
reports_dir = config.get('config', 'reports_dir')
# Read from the config but not referenced again in the visible part of this
# module.
master_list = config.get('config', 'master_list')
# Directory names pruned from every os.walk traversal in this module.
ignore_dirs = ['.git']
32
33
def prepare_project(project, project_dir):
    """Fetch every list needed for ``project`` and run all scans on it.

    The lists come from ``get_lists.GetLists``; once they are loaded the
    file/content/binary scan runs first, followed by the per-file licence
    header check and finally the root LICENSE file check.

    Args:
        project: project name, handed to the list lookups and to each scan.
        project_dir: root directory of the project to scan.
    """
    list_source = get_lists.GetLists()

    # Each of these lookups yields a pair: the general list and the
    # project-specific one.
    binary_general, binary_waivers = list_source.binary_list(project)
    name_general, name_waivers = list_source.file_audit_list(project)
    content_general, content_waivers = \
        list_source.file_content_list(project)

    # Licence settings take no project argument.
    header_extensions = list_source.licence_extensions()
    header_exempt = list_source.licence_ignore()

    # Rudimentary scans: file names, file contents, binaries.
    scan_file(project_dir=project_dir,
              project=project,
              binary_list=binary_general,
              binary_project_list=binary_waivers,
              file_audit_list=name_general,
              file_audit_project_list=name_waivers,
              file_content_list=content_general,
              project_content_list=content_waivers)

    # Licence checks: per-file headers, then the root LICENSE file.
    licence_check(licence_ext=header_extensions,
                  licence_ignore=header_exempt,
                  project=project,
                  project_dir=project_dir)
    licence_root_check(project_dir=project_dir, project=project)
61
62
def scan_file(project_dir, project, binary_list, binary_project_list,
              file_audit_list, file_audit_project_list, file_content_list,
              project_content_list):
    """Walk project_dir and report banned file names, content and binaries.

    Every file below ``project_dir`` (directories named in ``ignore_dirs``
    are pruned from the walk) goes through these checks:

    * its full path is searched with ``file_audit_list``; a hit that
      ``file_audit_project_list`` does not also match is reported;
    * if ``is_binary`` reports the file as text, each of its lines is
      searched with ``file_content_list``; a hit that
      ``project_content_list`` does not also match is reported;
    * otherwise the path is reported unless ``binary_list`` or
      ``binary_project_list`` matches it.

    Each violation is sent to ``logger.error`` and appended to
    ``<reports_dir>file-names_<project>.log``,
    ``<reports_dir>contents-<project>.log`` or
    ``<reports_dir>binaries-<project>.log`` respectively.

    Args:
        project_dir: root directory to walk.
        project: project name; only used to build the report file names.
        binary_list: pattern whitelisting binary paths.
        binary_project_list: project-specific binary whitelist pattern.
        file_audit_list: pattern blacklisting file paths.
        file_audit_project_list: project waiver pattern for file paths.
        file_content_list: pattern blacklisting line content.
        project_content_list: project waiver pattern for line content.

    All six pattern arguments only need a ``search(text)`` method; the two
    blacklist patterns must return a match object with ``group()``
    (compiled regular expressions satisfy this).
    """

    def append_report(kind, entries):
        # The report path is built by plain concatenation, so the
        # configured reports_dir is used verbatim as a prefix.
        with open(reports_dir + kind + project + ".log", "a") as gate_report:
            gate_report.writelines(entries)

    for root, dirs, files in os.walk(project_dir):
        # Assigning to the slice mutates the list os.walk holds, which is
        # what stops it from descending into the ignored directories.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for name in files:
            full_path = os.path.join(root, name)

            # Check for blacklisted file names (search once, reuse match).
            name_match = file_audit_list.search(full_path)
            if name_match and not file_audit_project_list.search(full_path):
                logger.error('Blacklisted filename: {0}'.format(full_path))
                logger.error('Matched String: {0}'.
                             format(name_match.group()))
                append_report("file-names_", [
                    'Blacklisted filename: {0}\n'.format(full_path),
                    # Terminated with a newline so that consecutive
                    # entries do not run together on one line.
                    'Matched String: {0}\n'.format(name_match.group()),
                ])

            if is_binary(full_path):
                # Check if the binary is whitelisted.
                if not binary_list.search(full_path) \
                        and not binary_project_list.search(full_path):
                    logger.error('Non Whitelisted Binary: {0}'.
                                 format(full_path))
                    append_report("binaries-", [
                        'Non Whitelisted Binary: {0}\n'.format(full_path),
                    ])
                continue

            # Read the whole file up front and close it straight away: the
            # handle is not leaked, and the loop below iterates a snapshot
            # even if a report file being appended to is itself scanned.
            with open(full_path, 'r') as source:
                lines = source.readlines()

            for line in lines:
                # Check for sensitive content in project files.
                content_match = file_content_list.search(line)
                if not content_match or project_content_list.search(line):
                    continue
                logger.error('File contains violation: {0}'.
                             format(full_path))
                logger.error('Flagged Content: {0}'.format(line.rstrip()))
                logger.error('Matched String: {0}'.
                             format(content_match.group()))
                # The last line of a file may lack a newline; add one so
                # the following report line starts on its own line.
                flagged = line if line.endswith('\n') else line + '\n'
                append_report("contents-", [
                    'File contains violation: {0}\n'.format(full_path),
                    'Flagged Content: {0}'.format(flagged),
                    'Matched String: {0}\n'.format(content_match.group()),
                ])
126
127
def licence_root_check(project_dir, project):
    """Check that a LICENSE file exists directly inside project_dir.

    A present file is noted with ``logger.info``. A missing one is sent to
    ``logger.error`` and appended to ``<reports_dir>licence-<project>.log``.

    Args:
        project_dir: directory expected to contain the LICENSE file.
        project: project name; only used to build the report file name.
    """
    # os.path.join instead of manual '/' concatenation: handles a trailing
    # separator and keeps an empty project_dir relative rather than turning
    # it into the filesystem root.
    if os.path.isfile(os.path.join(project_dir, 'LICENSE')):
        logger.info('LICENSE file present in: {0}'.format(project_dir))
        return

    logger.error('LICENSE file missing in: {0}'.format(project_dir))
    with open(reports_dir + "licence-" + project + ".log", "a") \
            as gate_report:
        gate_report.write('LICENSE file missing in: {0}\n'.
                          format(project_dir))
140
141
def licence_check(licence_ext, licence_ignore, project, project_dir):
    """Perform basic checks for the presence of licence strings.

    Walks ``project_dir`` (directories named in ``ignore_dirs`` are pruned)
    and, for every non-binary file whose name ends with one of
    ``licence_ext`` and is not listed in ``licence_ignore``, looks for the
    words "copyright" or "spdx" (case-insensitive) anywhere in the file.
    Files containing neither are sent to ``logger.error`` and appended to
    ``<reports_dir>licence-<project>.log``.

    Args:
        licence_ext: iterable of file-name suffixes that must carry a
            licence string.
        licence_ignore: container of file names (not paths) to skip.
        project: project name; only used to build the report file name.
        project_dir: root directory to walk.
    """
    # Built once instead of once per file.
    extensions = tuple(licence_ext)
    # Note: Hardcoded use of 'copyright' & 'spdx' is the result of a
    # decision made at 2017 plugfest to limit searches to just these two
    # strings.
    licence_marker = re.compile("copyright|spdx", re.IGNORECASE)

    for root, dirs, files in os.walk(project_dir):
        # Assigning to the slice mutates the list os.walk holds, which is
        # what stops it from descending into the ignored directories.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for filename in files:
            if not filename.endswith(extensions) \
                    or filename in licence_ignore:
                continue
            full_path = os.path.join(root, filename)
            if is_binary(full_path):
                continue

            # Context manager so the handle is closed as soon as the
            # content has been read.
            with open(full_path, 'r') as source:
                content = source.read()

            if licence_marker.search(content):
                logger.info('Licence string present: {0}'.
                            format(full_path))
                continue

            logger.error('Licence header missing: {0}'.format(full_path))
            with open(reports_dir + "licence-" + project + ".log", "a") \
                    as gate_report:
                gate_report.write('Licence header missing: {0}\n'.
                                  format(full_path))