From e3100a62b8a2e31ea3458bc895002e5e537903d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81ngel=20Gonz=C3=A1lez?= Date: Wed, 14 Nov 2012 18:16:50 +0100 Subject: [PATCH 1/2] Removed most mixed declarations and code from warc.c. --- src/ChangeLog | 2 +- src/warc.c | 81 ++++++++++++++++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 5ab9321..713a6ba 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -3,7 +3,7 @@ gcc -std=c89 * host.c (cache_query): Fix warning about format '%p' expecting an argument of type 'void *', but being given a 'struct address_list *' - * gnutls.c, html-url.c, http.c, retr.c, main.c: Removed mixed + * gnutls.c, html-url.c, http.c, retr.c, main.c, warc.c: Removed mixed declarations and code, making more C89 compatible. 2012-11-13 Giuseppe Scrivano diff --git a/src/warc.c b/src/warc.c index 437502c..04e11e4 100644 --- a/src/warc.c +++ b/src/warc.c @@ -154,10 +154,12 @@ warc_write_buffer (const char *buffer, size_t size) static bool warc_write_string (const char *str) { + size_t n; + if (!warc_write_ok) return false; - size_t n = strlen (str); + n = strlen (str); if (n != warc_write_buffer (str, n)) warc_write_ok = false; @@ -246,6 +248,9 @@ warc_write_block_from_file (FILE *data_in) { /* Add the Content-Length header. */ char *content_length; + char buffer[BUFSIZ]; + size_t s; + fseeko (data_in, 0L, SEEK_END); if (! asprintf (&content_length, "%ld", ftello (data_in))) { @@ -262,8 +267,6 @@ warc_write_block_from_file (FILE *data_in) warc_write_ok = false; /* Copy the data in the file to the WARC record. */ - char buffer[BUFSIZ]; - size_t s; while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0) { if (warc_write_buffer (buffer, s) < s) @@ -649,13 +652,14 @@ warc_write_warcinfo_record (char *filename) /* Write warc-info record as the first record of the file. */ /* We add the record id of this info record to the other records in the file. 
*/ + char timestamp[22]; FILE *warc_tmp; + char *filename_copy, *filename_basename; + warc_current_warcinfo_uuid_str = (char *) malloc (48); warc_uuid_str (warc_current_warcinfo_uuid_str); - char timestamp[22]; warc_timestamp (timestamp); - char *filename_copy, *filename_basename; filename_copy = strdup (filename); filename_basename = strdup (basename (filename_copy)); @@ -667,7 +671,7 @@ warc_write_warcinfo_record (char *filename) warc_write_header ("WARC-Filename", filename_basename); /* Create content. */ - FILE *warc_tmp = warc_tempfile (); + warc_tmp = warc_tempfile (); if (warc_tmp == NULL) { free (filename_copy); @@ -717,6 +721,15 @@ warc_write_warcinfo_record (char *filename) static bool warc_start_new_file (bool meta) { + int base_filename_length; + char *new_filename; + +#ifdef HAVE_LIBZ + const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc"); +#else + const char *extension = "warc"; +#endif + if (opt.warc_filename == NULL) return false; @@ -729,17 +742,11 @@ warc_start_new_file (bool meta) warc_current_file_number++; - int base_filename_length = strlen (opt.warc_filename); + base_filename_length = strlen (opt.warc_filename); /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */ - char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); + new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); warc_current_filename = new_filename; -#ifdef HAVE_LIBZ - const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc"); -#else - const char *extension = "warc"; -#endif - /* If max size is enabled, we add a serial number to the file names. 
*/ if (meta) sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension); @@ -811,12 +818,13 @@ static bool warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id) { + char *token; + char *save_ptr; + *field_num_original_url = -1; *field_num_checksum = -1; *field_num_record_id = -1; - char *token; - char *save_ptr; token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); if (token != NULL && strcmp (token, "CDX") == 0) @@ -860,10 +868,11 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, char *token; char *save_ptr; + int field_num = 0; + token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); /* Read this line to get the fields we need. */ - int field_num = 0; while (token != NULL) { char **val; @@ -926,10 +935,6 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, static bool warc_load_cdx_dedup_file (void) { - FILE *f = fopen (opt.warc_cdx_dedup_filename, "r"); - if (f == NULL) - return false; - int field_num_original_url = -1; int field_num_checksum = -1; int field_num_record_id = -1; @@ -938,6 +943,10 @@ warc_load_cdx_dedup_file (void) size_t n = 0; ssize_t line_length; + FILE *f = fopen (opt.warc_cdx_dedup_filename, "r"); + if (f == NULL) + return false; + /* The first line should contain the CDX header. Format: " CDX x x x x x" where x are field type indicators. For our purposes, we only @@ -965,6 +974,7 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); } else { + int nrecords; /* Initialize the table. */ warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest); @@ -982,7 +992,7 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); while (line_length != -1); /* Print results. 
*/ - int nrecords = hash_table_count (warc_cdx_dedup_table); + nrecords = hash_table_count (warc_cdx_dedup_table); logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n", "Loaded %d records from CDX.\n\n", nrecords), @@ -1002,11 +1012,12 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); static struct warc_cdx_record * warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) { + struct warc_cdx_record *rec_existing; + if (warc_cdx_dedup_table == NULL) return NULL; - struct warc_cdx_record *rec_existing - = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload); + rec_existing = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload); if (rec_existing && strcmp (rec_existing->url, url) == 0) return rec_existing; @@ -1077,11 +1088,13 @@ warc_init (void) static void warc_write_metadata (void) { + char manifest_uuid [48]; + FILE *warc_tmp_fp; + /* If there are multiple WARC files, the metadata should be written to a separate file. */ if (opt.warc_maxsize > 0) warc_start_new_file (true); - char manifest_uuid [48]; warc_uuid_str (manifest_uuid); fflush (warc_manifest_fp); @@ -1091,7 +1104,7 @@ warc_write_metadata (void) warc_manifest_fp, -1); /* warc_write_resource_record has closed warc_manifest_fp. */ - FILE * warc_tmp_fp = warc_tempfile (); + warc_tmp_fp = warc_tempfile (); if (warc_tmp_fp == NULL) { logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n")); @@ -1146,10 +1159,12 @@ FILE * warc_tempfile (void) { char filename[100]; + int fd; + if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1) return NULL; - int fd = mkstemp (filename); + fd = mkstemp (filename); if (fd < 0) return NULL; @@ -1210,6 +1225,8 @@ warc_write_cdx_record (const char *url, const char *timestamp_str, { /* Transform the timestamp. 
*/ char timestamp_str_cdx [15]; + const char *checksum; + memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */ memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */ memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */ @@ -1219,7 +1236,6 @@ warc_write_cdx_record (const char *url, const char *timestamp_str, timestamp_str_cdx[14] = '\0'; /* Rewrite the checksum. */ - const char *checksum; if (payload_digest != NULL) checksum = payload_digest + 5; /* Skip the "sha1:" */ else @@ -1258,10 +1274,10 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *refers_to, ip_address *ip, FILE *body) { char revisit_uuid [48]; - warc_uuid_str (revisit_uuid); - char *block_digest = NULL; char sha1_res_block[SHA1_DIGEST_SIZE]; + + warc_uuid_str (revisit_uuid); sha1_stream (body, sha1_res_block); block_digest = warc_base32_sha1_digest (sha1_res_block); @@ -1311,6 +1327,8 @@ warc_write_response_record (char *url, char *timestamp_str, char *payload_digest = NULL; char sha1_res_block[SHA1_DIGEST_SIZE]; char sha1_res_payload[SHA1_DIGEST_SIZE]; + char response_uuid [48]; + off_t offset; if (opt.warc_digests_enabled) { @@ -1355,11 +1373,10 @@ warc_write_response_record (char *url, char *timestamp_str, /* Not a revisit, just store the record. */ - char response_uuid [48]; warc_uuid_str (response_uuid); fseeko (warc_current_file, 0L, SEEK_END); - off_t offset = ftello (warc_current_file); + offset = ftello (warc_current_file); warc_write_start_record (); warc_write_header ("WARC-Type", "response"); -- 1.8.0