This commit is contained in:
2024-12-09 18:22:38 +09:00
parent ab0cbebefc
commit c4c4547706
959 changed files with 174888 additions and 6 deletions

View File

@ -0,0 +1,30 @@
class BaseTsharkOutputParser:
    """Common base for parsers that turn raw tshark output into packets.

    Subclasses provide ``_extract_packet_from_data`` (cut one raw packet out of
    a buffer) and ``_parse_single_packet`` (turn that raw packet into an object).
    """
    DEFAULT_BATCH_SIZE = 2 ** 16

    async def get_packets_from_stream(self, stream, existing_data, got_first_packet=True):
        """Coroutine that tries to produce a single packet.

        The buffered ``existing_data`` is inspected first; the stream is only
        read (one batch of ``DEFAULT_BATCH_SIZE``) when no complete packet is
        buffered yet.

        :param stream: object whose awaitable ``read(n)`` returns more raw data.
        :param existing_data: data left over from a previous call.
        :param got_first_packet: forwarded to ``_extract_packet_from_data``.
        :return: a tuple of (packet, remaining_data). packet is None when the
            buffered data did not contain a complete packet; remaining_data is
            what must be passed back in on the next call.
        :raises EOFError: if the stream had no more data to give.
        """
        raw_packet, leftover = self._extract_packet_from_data(
            existing_data, got_first_packet=got_first_packet)
        if raw_packet:
            return self._parse_single_packet(raw_packet), leftover

        # Nothing complete is buffered: pull one more batch from the stream.
        chunk = await stream.read(self.DEFAULT_BATCH_SIZE)
        leftover += chunk
        if not chunk:
            raise EOFError()
        return None, leftover

    def _parse_single_packet(self, packet):
        """Turn one raw packet into a packet object (subclass hook)."""
        raise NotImplementedError()

    def _extract_packet_from_data(self, data, got_first_packet=True):
        """Return (raw_packet, remaining_data) for the first packet in data (subclass hook)."""
        raise NotImplementedError()

View File

@ -0,0 +1,59 @@
import json
import os
from pyshark.tshark.output_parser.base_parser import BaseTsharkOutputParser
try:
import ujson
USE_UJSON = True
except ImportError:
USE_UJSON = False
from pyshark.packet.layers.ek_layer import EkLayer
from pyshark.packet.packet import Packet
# The platform line separator as bytes; the EK parser searches for it to find
# where one line of output ends and the next begins.
_ENCODED_OS_LINESEP = os.linesep.encode()
class TsharkEkJsonParser(BaseTsharkOutputParser):
    """Parser for line-oriented EK (Elastic) JSON output, one JSON document per line."""

    def _parse_single_packet(self, packet):
        """Build a packet object from one raw EK JSON line."""
        return packet_from_ek_packet(packet)

    def _extract_packet_from_data(self, data, got_first_packet=True):
        """Cut the first complete packet line out of ``data``.

        :param data: bytes buffered so far; leading whitespace is discarded.
        :param got_first_packet: unused by this parser.
        :return: a tuple of (packet_bytes, remaining_data); packet_bytes is
            None when no complete line is buffered yet.
        """
        data = data.lstrip()
        # Lines starting with '{"ind' are the 'index' JSONs generated for Elastic
        # and are skipped.
        # See: https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=16656
        if data.startswith(b'{"ind'):
            packet_start = data.find(_ENCODED_OS_LINESEP) + 1
        else:
            packet_start = 0

        packet_end = data.find(_ENCODED_OS_LINESEP, packet_start)
        if packet_end != -1:
            return data[packet_start:packet_end], data[packet_end + 1:]
        return None, data
def packet_from_ek_packet(json_pkt):
    """Create a Packet from a single EK JSON packet.

    :param json_pkt: the raw JSON document of one packet, as bytes.
    :return: a Packet whose layers follow the order given by the frame's
        ``frame_frame_protocols`` field, followed by any layers not listed there.
    :raises KeyError: if the document lacks the ``layers``/``frame`` entries or
        the mandatory frame fields (protocols, length, epoch time).
    """
    if USE_UJSON:
        pkt_dict = ujson.loads(json_pkt)
    else:
        pkt_dict = json.loads(json_pkt.decode('utf-8'))

    # We use the frame dict here and not the object access because it's faster.
    layers_dict = pkt_dict['layers']
    frame_dict = layers_dict.pop('frame')

    layers = []
    for layer in frame_dict['frame_frame_protocols'].split(':'):
        # Popping keeps track of which layers were already consumed; a protocol
        # that has no entry of its own is simply skipped.
        layer_dict = layers_dict.pop(layer, None)
        if layer_dict is not None:
            layers.append(EkLayer(layer, layer_dict))

    # Add all leftovers
    layers.extend(EkLayer(name, layer) for name, layer in layers_dict.items())

    return Packet(layers=layers, frame_info=EkLayer('frame', frame_dict),
                  number=int(frame_dict.get('frame_frame_number', 0)),
                  length=int(frame_dict['frame_frame_len']),
                  sniff_time=frame_dict['frame_frame_time_epoch'],
                  # The key used to be misspelled ('rame_frame_interface_id'), which
                  # made the captured interface always come out as None.
                  interface_captured=frame_dict.get('frame_frame_interface_id'))

View File

@ -0,0 +1,112 @@
import json
import os
from packaging import version
from pyshark.packet.layers.json_layer import JsonLayer
from pyshark.packet.packet import Packet
from pyshark.tshark.output_parser.base_parser import BaseTsharkOutputParser
from pyshark.tshark import tshark
try:
import ujson
USE_UJSON = True
except ImportError:
USE_UJSON = False
class TsharkJsonParser(BaseTsharkOutputParser):
    """Parser that cuts tshark JSON output into packets and builds Packet objects.

    :param tshark_version: parsed version of the tshark producing the output, or
        None when unknown. An unknown version is treated like a recent tshark.
    """

    def __init__(self, tshark_version=None):
        super().__init__()
        self._tshark_version = tshark_version

    def _parse_single_packet(self, packet):
        """Build a Packet from the raw JSON of one packet."""
        if not self._tshark_version:
            # Unknown version: assume a recent tshark, the same assumption
            # _get_json_separators makes. Comparing None against a version would
            # raise TypeError, and deduplicating is the safe choice.
            json_has_duplicate_keys = True
        else:
            json_has_duplicate_keys = tshark.tshark_supports_duplicate_keys(self._tshark_version)
        return packet_from_json_packet(packet, deduplicate_fields=json_has_duplicate_keys)

    def _extract_packet_from_data(self, data, got_first_packet=True):
        """Returns a packet's data and any remaining data after reading that first packet

        :param data: bytes buffered so far.
        :param got_first_packet: when False, everything before the first ``{`` is skipped.
        :return: a tuple of (packet_bytes, remaining_data); packet_bytes is None
            when no complete packet is buffered yet.
        """
        tag_start = 0
        if not got_first_packet:
            tag_start = data.find(b"{")
            if tag_start == -1:
                return None, data

        packet_separator, end_separator, end_tag_strip_length = self._get_json_separators()
        found_separator = None

        tag_end = data.find(packet_separator)
        if tag_end == -1:
            # Not end of packet, maybe it has end of entire file?
            tag_end = data.find(end_separator)
            if tag_end != -1:
                found_separator = end_separator
        else:
            # Found a single packet, just add the separator without extras
            found_separator = packet_separator

        if found_separator:
            # Keep the part of the separator that closes the packet, drop the rest.
            tag_end += len(found_separator) - end_tag_strip_length
            return data[tag_start:tag_end].strip().strip(b","), data[tag_end + 1:]
        return None, data

    def _get_json_separators(self):
        """Returns the separators between packets in a JSON output

        Returns a tuple of (packet_separator, end_of_file_separator, characters_to_disregard).
        The latter variable being the number of characters to ignore in order to pass the packet (i.e. extra newlines,
        commas, parenthesis).
        """
        # NOTE(review): the exact whitespace inside these literals has to match
        # tshark's output byte for byte - confirm against a real capture.
        if not self._tshark_version or self._tshark_version >= version.parse("3.0.0"):
            return f"{os.linesep} }},{os.linesep}".encode(), f"}}{os.linesep}]".encode(), 1 + len(os.linesep)
        else:
            return f"}}{os.linesep}{os.linesep} ,".encode(), f"}}{os.linesep}{os.linesep}]".encode(), 1
def duplicate_object_hook(ordered_pairs):
    """Make lists out of duplicate keys.

    Meant to be used as ``object_pairs_hook`` for ``json.loads``.

    :param ordered_pairs: iterable of (key, value) pairs of one JSON object, in
        document order.
    :return: a dict in which a key that occurred once maps to its value and a
        key that occurred several times maps to a list of its values. If the
        stored value is already a list, later duplicates are appended to it.
    """
    json_dict = {}
    for key, val in ordered_pairs:
        # Test for presence of the key, not truthiness of the stored value:
        # otherwise an empty or zero first value would silently be thrown away.
        if key not in json_dict:
            json_dict[key] = val
            continue

        existing_val = json_dict[key]
        if isinstance(existing_val, list):
            existing_val.append(val)
        else:
            json_dict[key] = [existing_val, val]
    return json_dict
def packet_from_json_packet(json_pkt, deduplicate_fields=True):
    """Creates a Pyshark Packet from a tshark json single packet.

    :param json_pkt: the raw JSON document of one packet, as bytes.
    :param deduplicate_fields: when true, the document is decoded with
        ``duplicate_object_hook`` so that repeated keys are kept as lists. This
        path always uses the standard ``json`` module (ujson has no hooks), which
        makes it the slower of the two.
    :return: a Packet whose layers follow the order given by ``frame.protocols``,
        followed by any layers not listed there.
    """
    if deduplicate_fields:
        # NOTE: ujson would be faster here, but it cannot take hooks, so the
        # duplicates would be lost.
        parsed = json.loads(json_pkt.decode('utf-8'), object_pairs_hook=duplicate_object_hook)
    elif USE_UJSON:
        parsed = ujson.loads(json_pkt)
    else:
        parsed = json.loads(json_pkt.decode('utf-8'))

    # Work on the plain dicts rather than object access because it's faster.
    remaining_layers = parsed['_source']['layers']
    frame_fields = remaining_layers.pop('frame')

    # Pop each protocol as it is met; protocols without an entry are skipped.
    ordered = (
        (proto_name, remaining_layers.pop(proto_name, None))
        for proto_name in frame_fields['frame.protocols'].split(':')
    )
    layers = [JsonLayer(proto_name, content) for proto_name, content in ordered if content is not None]

    # Whatever was not named in frame.protocols goes at the end.
    for leftover_name, leftover_content in remaining_layers.items():
        layers.append(JsonLayer(leftover_name, leftover_content))

    frame_info = JsonLayer('frame', frame_fields)
    packet_number = int(frame_fields.get('frame.number', 0))
    packet_length = int(frame_fields['frame.len'])
    return Packet(layers=layers, frame_info=frame_info,
                  number=packet_number,
                  length=packet_length,
                  sniff_time=frame_fields['frame.time_epoch'],
                  interface_captured=frame_fields.get('frame.interface_id'))

View File

@ -0,0 +1,118 @@
"""This module contains functions to turn TShark XML parts into Packet objects."""
import lxml.objectify
from pyshark.packet.layers.xml_layer import XmlLayer
from pyshark.packet.packet import Packet
from pyshark.packet.packet_summary import PacketSummary
from pyshark.tshark.output_parser.base_parser import BaseTsharkOutputParser
# Translation table for str.translate that deletes characters which are invalid
# in XML: control characters other than tab, line feed and carriage return, the
# surrogate range, and the two code points U+FFFE and U+FFFF.
DEL_BAD_XML_CHARS = dict.fromkeys(
    code_point for code_point in range(0x20) if code_point not in (0x09, 0x0a, 0x0d)
)
DEL_BAD_XML_CHARS.update(dict.fromkeys(range(0xd800, 0xe000)))
DEL_BAD_XML_CHARS.update(dict.fromkeys(range(0xfffe, 0x10000)))
class TsharkXmlParser(BaseTsharkOutputParser):
    """Parser that cuts ``<packet>`` elements out of tshark XML output.

    :param parse_summaries: when true, the ``<structure>`` element at the top of
        the output is read first and packets are built as packet summaries.
    """
    SUMMARIES_BATCH_SIZE = 64

    def __init__(self, parse_summaries=False):
        super().__init__()
        self._parse_summaries = parse_summaries
        self._psml_structure = None

    async def get_packets_from_stream(self, stream, existing_data, got_first_packet=True):
        """Coroutine returning (packet, remaining_data); see the base class.

        When summaries are requested, the PSML structure is read once, before
        the first packet. It must not be looked for again on later calls: the
        element never reappears, so searching for it would throw away the
        caller's buffered data and drain the stream.
        """
        if self._parse_summaries and self._psml_structure is None:
            existing_data = await self._get_psml_struct(stream, existing_data)
        return await super().get_packets_from_stream(stream, existing_data, got_first_packet=got_first_packet)

    def _parse_single_packet(self, packet):
        """Build a packet (or packet summary) from the raw XML of one packet."""
        return packet_from_xml_packet(packet, psml_structure=self._psml_structure)

    def _extract_packet_from_data(self, data, got_first_packet=True):
        """Gets data containing a (part of) tshark xml.

        If a ``packet`` tag is found in it, returns the tag data and the remaining data.
        Otherwise returns None and the same data.

        :param data: bytes of a partial tshark xml.
        :param got_first_packet: unused by this parser.
        :return: a tuple of (tag, data). tag will be None if none is found.
        """
        return _extract_tag_from_xml_data(data, tag_name=b"packet")

    async def _get_psml_struct(self, fd, initial_data=b""):
        """Reads the PSML (packet summary xml) structure and stores it on the parser.

        A coroutine. The stream is read in batches of ``SUMMARIES_BATCH_SIZE``
        until the ``structure`` element is complete or the stream ends.

        :param fd: object whose awaitable ``read(n)`` returns more raw data.
        :param initial_data: data that was already read from the stream.
        :return: the data left over after the structure element, or everything
            read so far if the stream ended before the element was complete.
        """
        # If summaries are read, we need the psml structure which appears on top of the file.
        while True:
            psml_struct, initial_data = _extract_tag_from_xml_data(initial_data, b"structure")
            if psml_struct:
                self._psml_structure = psml_structure_from_xml(psml_struct)
                return initial_data

            new_data = await fd.read(self.SUMMARIES_BATCH_SIZE)
            if not new_data:
                # End of stream without a structure element.
                return initial_data
            initial_data += new_data
def psml_structure_from_xml(psml_structure):
    """Return the ``section`` elements of a PSML structure.

    :param psml_structure: the structure element, either already objectified or
        as raw XML that still has to be parsed.
    :return: the result of ``findall('section')`` on the structure element.
    """
    already_parsed = isinstance(psml_structure, lxml.objectify.ObjectifiedElement)
    root = psml_structure if already_parsed else lxml.objectify.fromstring(psml_structure)
    return root.findall('section')
def packet_from_xml_packet(xml_pkt, psml_structure=None):
    """
    Gets a TShark XML packet object or bytes, and returns a pyshark Packet object.

    :param xml_pkt: raw XML bytes or an already objectified xml element.
    :param psml_structure: a list of the fields in each packet summary in the psml data. If given, packets will
    be returned as a PacketSummary object.
    :return: Packet object.
    """
    needs_parsing = not isinstance(xml_pkt, lxml.objectify.ObjectifiedElement)
    if needs_parsing:
        xml_parser = lxml.objectify.makeparser(huge_tree=True, recover=True, encoding='utf-8')
        # Drop undecodable bytes and characters that are not allowed in XML
        # before handing the text to the parser.
        cleaned_text = xml_pkt.decode(errors='ignore').translate(DEL_BAD_XML_CHARS)
        xml_pkt = lxml.objectify.fromstring(cleaned_text.encode('utf-8'), xml_parser)

    if not psml_structure:
        return _packet_from_pdml_packet(xml_pkt)
    return _packet_from_psml_packet(xml_pkt, psml_structure)
def _packet_from_psml_packet(psml_packet, structure):
    """Build a PacketSummary from a PSML packet element and the PSML structure."""
    section_values = psml_packet.findall('section')
    return PacketSummary(structure, section_values)
def _packet_from_pdml_packet(pdml_packet):
    """Build a Packet from a PDML packet element.

    The first ``proto`` element is used for the general info (number, lengths,
    timestamp), the second one as the frame info; the rest become the layers.
    """
    protos = list(map(XmlLayer, pdml_packet.proto))
    geninfo, frame, remaining = protos[0], protos[1], protos[2:]
    return Packet(
        layers=remaining,
        frame_info=frame,
        number=geninfo.get_field_value('num'),
        length=geninfo.get_field_value('len'),
        sniff_time=geninfo.get_field_value('timestamp', raw=True),
        captured_length=geninfo.get_field_value('caplen'),
        interface_captured=frame.get_field_value('interface_id', raw=True),
    )
def _extract_tag_from_xml_data(data, tag_name=b"packet"):
    """Gets data containing a (part of) tshark xml.

    If the given tag is found in it, returns the tag data and the remaining data.
    Otherwise returns None and the data that is still worth keeping.

    :param data: bytes of a partial tshark xml.
    :param tag_name: A bytes string of the tag name
    :return: a tuple of (tag, data). tag will be None if none is found. When a
        closing tag shows up without its opening tag before it, everything up
        to and including that closing tag is dropped from the returned data.
    """
    opening_tag = b"<" + tag_name + b">"
    closing_tag = b"</" + tag_name + b">"

    closing_index = data.find(closing_tag)
    if closing_index == -1:
        # The element is not complete yet; keep everything.
        return None, data

    tag_end = closing_index + len(closing_tag)
    # Only an opening tag in front of the closing tag can belong to it.
    tag_start = data.find(opening_tag, 0, closing_index)
    if tag_start == -1:
        # Orphan closing tag: slicing from -1 would hand back garbage, and
        # keeping the data would make every later search stop at it again.
        return None, data[tag_end:]
    return data[tag_start:tag_end], data[tag_end:]

View File

@ -0,0 +1,169 @@
"""Module used for the actual running of TShark"""
import json
from packaging import version
import os
import subprocess
import sys
import re
from pyshark.config import get_config
class TSharkNotFoundException(Exception):
    """Raised when the requested executable is not found in any searched location."""
class TSharkVersionException(Exception):
    """Raised when the TShark version cannot be parsed or lacks a needed feature."""
# Matches one interface line of the form "<index>. <name> (<alias>)": group 1 is
# the whitespace-free name that follows the index, group 2 the optional alias
# given in parentheses.
_TSHARK_INTERFACE_ALIAS_PATTERN = re.compile(r"[0-9]*\. ([^\s]*)(?: \((.*)\))?")
def get_process_path(tshark_path=None, process_name="tshark"):
    """Finds the path of the tshark executable (or of a sibling executable).

    Candidates are tried in this order: the directory of the user-provided
    path, the location given in the configuration, then the platform's default
    locations. The first candidate that exists is returned.

    :param tshark_path: Path of the tshark binary; only its directory is used.
    :param process_name: Name of the executable to look for, without extension.
    :return: the path of the first existing candidate (with forward slashes on Windows).
    :raises TSharkNotFoundException in case the executable is not found in any location.
    """
    on_windows = sys.platform.startswith("win")
    candidates = []

    # Location from `config.ini`, if a configuration is available.
    config = get_config()
    if config:
        candidates.append(config.get(process_name, f"{process_name}_path"))

    # The user provided location takes precedence over everything else.
    if tshark_path is not None:
        binary_name = f"{process_name}.exe" if on_windows else process_name
        candidates.insert(0, os.path.join(os.path.dirname(tshark_path), binary_name))

    if on_windows:
        # Windows: the common installation directories.
        for env_name in ("ProgramFiles(x86)", "ProgramFiles"):
            install_root = os.getenv(env_name)
            if install_root is not None:
                candidates.append(os.path.join(install_root, "Wireshark", f"{process_name}.exe"))
    else:
        # Linux, etc.: every directory of the system's path.
        search_path = os.getenv("PATH", "/usr/bin:/usr/sbin:/usr/lib/tshark:/usr/local/bin")
        candidates.extend(os.path.join(directory, process_name) for directory in search_path.split(":"))
    if sys.platform.startswith("darwin"):
        candidates.append(f"/Applications/Wireshark.app/Contents/MacOS/{process_name}")

    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        return candidate.replace("\\", "/") if on_windows else candidate

    raise TSharkNotFoundException(
        "TShark not found. Try adding its location to the configuration file. "
        f"Searched these paths: {candidates}"
    )
def get_tshark_version(tshark_path=None):
    """Runs tshark with ``-v`` and returns the version found on the first output line.

    :param tshark_path: Path of the tshark binary, passed on to get_process_path.
    :return: the parsed version object.
    :raises TSharkVersionException if no " #.#.#" version is found on the first line.
    """
    command = [get_process_path(tshark_path), "-v"]
    raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    first_line = raw_output.decode("ascii").splitlines()[0]

    # A whitespace character followed by three dot-separated numbers.
    found = re.match(r'.*\s(\d+\.\d+\.\d+).*', first_line)
    if found is None:
        raise TSharkVersionException(f"Unable to parse TShark version from: {first_line}")
    return version.parse(found.group(1))
def tshark_supports_duplicate_keys(tshark_version):
    """Return whether ``tshark_version`` is at least 2.6.7.

    :param tshark_version: a parsed version object.
    """
    minimum_version = version.parse("2.6.7")
    return tshark_version >= minimum_version
def tshark_supports_json(tshark_version):
    """Return whether ``tshark_version`` is at least 2.2.0.

    :param tshark_version: a parsed version object.
    """
    minimum_version = version.parse("2.2.0")
    return tshark_version >= minimum_version
def get_tshark_display_filter_flag(tshark_version):
    """Returns '-Y' for tshark versions >= 1.10.0 and '-R' for older versions.

    :param tshark_version: a parsed version object.
    """
    uses_new_flag = tshark_version >= version.parse("1.10.0")
    return "-Y" if uses_new_flag else "-R"
def get_tshark_interfaces(tshark_path=None):
    """Returns the interfaces listed in the output of tshark -D.

    For every output line the second space-separated token is taken, i.e. the
    identifier that follows the leading "<index>." Lines containing the
    ``\\\\.\\`` prefix are left out.

    Used internally to capture on multiple interfaces.

    :param tshark_path: Path of the tshark binary, passed on to get_process_path.
    """
    command = [get_process_path(tshark_path), "-D"]
    listing = subprocess.check_output(command, stderr=subprocess.DEVNULL).decode("utf-8")

    interfaces = []
    for entry in listing.splitlines():
        if '\\\\.\\' in entry:
            continue
        interfaces.append(entry.split(" ")[1])
    return interfaces
def get_all_tshark_interfaces_names(tshark_path=None):
    """Returns a list of all possible interface names. Some interfaces may have aliases

    Both the name and, when present, the alias of every interface line printed
    by tshark -D are included.

    :param tshark_path: Path of the tshark binary, passed on to get_process_path.
    """
    command = [get_process_path(tshark_path), "-D"]
    listing = subprocess.check_output(command, stderr=subprocess.DEVNULL).decode("utf-8")

    names = []
    for entry in listing.splitlines():
        found = _TSHARK_INTERFACE_ALIAS_PATTERN.findall(entry)
        if not found:
            continue
        # Only the first match of a line counts; an absent alias shows up as an
        # empty string and is filtered out.
        names.extend(filter(None, found[0]))
    return names
def get_ek_field_mapping(tshark_path=None):
    """Returns the per-layer field mapping reported by ``tshark -G elastic-mapping``.

    :param tshark_path: Path of the tshark binary, passed on to get_process_path.
    :return: the ``properties`` of the ``layers`` entry of the mapping.
    :raises TSharkVersionException if the mapping has none of the known layouts.
    """
    command = [get_process_path(tshark_path), "-G", "elastic-mapping"]
    raw_mapping = subprocess.check_output(command, stderr=subprocess.DEVNULL).decode("ascii")
    mappings = json.loads(raw_mapping, object_pairs_hook=_duplicate_object_hook)["mappings"]

    # Known layouts, newest first: the wanted data sits directly under
    # "mappings" (wireshark 4), under "mappings.doc" (3.5 to < 4), or under
    # "mappings.pcap_file" (< 3.5).
    is_flat_layout = "dynamic" in mappings and "properties" in mappings
    if not is_flat_layout:
        for wrapper_key in ("doc", "pcap_file"):
            if wrapper_key in mappings:
                mappings = mappings[wrapper_key]
                break
        else:
            raise TSharkVersionException("Your tshark version does not support elastic-mapping. Please upgrade.")
    return mappings["properties"]["layers"]["properties"]
def _duplicate_object_hook(ordered_pairs):
    """Collapse duplicate keys of a JSON object into a single value.

    Meant to be used as ``object_pairs_hook`` for ``json.loads``. A later
    duplicate replaces the stored value, unless the duplicate is an object
    whose ``properties`` are empty - those carry no data and are dropped.

    :param ordered_pairs: iterable of (key, value) pairs of one JSON object, in
        document order.
    :return: a dict with one value per key.
    """
    json_dict = {}
    for key, val in ordered_pairs:
        if not json_dict.get(key):
            # First occurrence, or nothing worth keeping stored so far.
            json_dict[key] = val
            continue

        # There are duplicates without any data for some reason, if it's that - drop it
        # Otherwise, override. Values that are not objects have no properties
        # to inspect, so they always override.
        is_empty_duplicate = isinstance(val, dict) and val.get("properties") == {}
        if not is_empty_duplicate:
            json_dict[key] = val
    return json_dict