commit b54fb8feb9dfb2a111d15f1b759de61217d5251e Author: Gijs van Tulder Date: Fri Apr 12 23:37:45 2013 +0200 warc: Follow the guidelines for metadata records Do not use the same UUID for the manifest and arguments records. Write the manifest as a metadata record, not as a resource. diff --git a/src/ChangeLog b/src/ChangeLog index 65d636d..e609f2d 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2013-04-12 Gijs van Tulder + + * warc.c: Generate unique UUIDs for the manifest and the record + holding the command-line arguments. + Write the manifest to a "metadata" record to follow the WARC + implementation guidelines. + * warc.h: Declare new function warc_write_metadata_record. + 2013-03-31 Gijs van Tulder * warc.c: Correctly write the field length in the skip length field diff --git a/src/warc.c b/src/warc.c index 9b10610..916b53d 100644 --- a/src/warc.c +++ b/src/warc.c @@ -1083,7 +1083,7 @@ warc_write_metadata (void) warc_uuid_str (manifest_uuid); fflush (warc_manifest_fp); - warc_write_resource_record (manifest_uuid, + warc_write_metadata_record (manifest_uuid, "metadata://gnu.org/software/wget/warc/MANIFEST.txt", NULL, NULL, NULL, "text/plain", warc_manifest_fp, -1); @@ -1098,9 +1098,9 @@ warc_write_metadata (void) fflush (warc_tmp_fp); fprintf (warc_tmp_fp, "%s\n", program_argstring); - warc_write_resource_record (manifest_uuid, + warc_write_resource_record (NULL, "metadata://gnu.org/software/wget/warc/wget_arguments.txt", - NULL, NULL, NULL, "text/plain", + NULL, manifest_uuid, NULL, "text/plain", warc_tmp_fp, -1); /* warc_write_resource_record has closed warc_tmp_fp. */ @@ -1395,20 +1395,22 @@ warc_write_response_record (char *url, char *timestamp_str, return warc_write_ok; } -/* Writes a resource record to the WARC file. +/* Writes a resource or metadata record to the WARC file. 
+ record_type is either "resource" or "metadata", resource_uuid is the uuid of the resource (or NULL), url is the target uri of the resource, timestamp_str is the timestamp (generated with warc_timestamp), - concurrent_to_uuid is the uuid of the request for that generated this + concurrent_to_uuid is the uuid of the record that generated this resource (generated with warc_uuid_str) or NULL, ip is the ip address of the server (or NULL), content_type is the mime type of the body (or NULL), body is a pointer to a file containing the resource data. Calling this function will close body. Returns true on success, false on error. */ -bool -warc_write_resource_record (char *resource_uuid, const char *url, - const char *timestamp_str, const char *concurrent_to_uuid, +static bool +warc_write_record (const char *record_type, char *resource_uuid, + const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, ip_address *ip, const char *content_type, FILE *body, off_t payload_offset) { @@ -1422,7 +1424,7 @@ warc_write_resource_record (char *resource_uuid, const char *url, content_type = "application/octet-stream"; warc_write_start_record (); - warc_write_header ("WARC-Type", "resource"); + warc_write_header ("WARC-Type", record_type); warc_write_header ("WARC-Record-ID", resource_uuid); warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid); @@ -1438,3 +1440,47 @@ warc_write_resource_record (char *resource_uuid, const char *url, return warc_write_ok; } + +/* Writes a resource record to the WARC file. 
+ resource_uuid is the uuid of the resource (or NULL), + url is the target uri of the resource, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the record that generated this + resource (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the resource data. + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_resource_record (char *resource_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + return warc_write_record ("resource", + resource_uuid, url, timestamp_str, concurrent_to_uuid, + ip, content_type, body, payload_offset); +} + +/* Writes a metadata record to the WARC file. + record_uuid is the uuid of the record (or NULL), + url is the target uri of the record, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the record that generated this + record (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the record data. + Calling this function will close body. + Returns true on success, false on error. 
*/ +bool +warc_write_metadata_record (char *record_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + return warc_write_record ("metadata", + record_uuid, url, timestamp_str, concurrent_to_uuid, + ip, content_type, body, payload_offset); +} diff --git a/src/warc.h b/src/warc.h index eba640d..45632cb 100644 --- a/src/warc.h +++ b/src/warc.h @@ -19,5 +19,8 @@ bool warc_write_response_record (char *url, char *timestamp_str, bool warc_write_resource_record (char *resource_uuid, const char *url, const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip, const char *content_type, FILE *body, off_t payload_offset); +bool warc_write_metadata_record (char *record_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip, + const char *content_type, FILE *body, off_t payload_offset); #endif /* WARC_H */